# Analysis of Code Switching in Vojna i Mir

In [45]:
import ast
import os
import pandas as pd

### Analyze code-switching (CS) types in the different volumes

In [42]:
# Print summary statistics and code-switching (CS) totals for every
# per-volume CS table found in the output directory.
output_dir = '../outputs/'
for f in sorted(os.listdir(output_dir)):
    # Only the `cs_*.csv` tables are wanted; skip every other entry
    # (other outputs, sub-directories, non-CSV files that share the prefix).
    if not (f.startswith('cs_') and f.endswith('.csv')):
        continue
    # os.path.join keeps the path valid whether or not output_dir ends in '/'.
    # `df` stays bound to the last table read once the loop has finished.
    df = pd.read_csv(os.path.join(output_dir, f))
    print(f)
    # count / mean / std / min / quartiles / max of the numeric columns
    print(df.describe())
    # column-wise totals of the numeric columns
    print(df.sum(numeric_only=True))

    # NOTE(review): rows look like one per sentence, while num_sent,
    # num_interturn and num_intersent look like per-paragraph values repeated
    # on each sentence row. If so, the first two totals below count a
    # paragraph once per sentence; de-duplicate on
    # ['volume', 'part', 'chapter', 'paragraph'] before summing. TODO confirm.
    print("Total num_interturn:", df['num_interturn'].sum())
    print("Total num_intersent:", df['num_intersent'].sum())
    # Vectorised comparison: number of rows with at least one intrasent switch.
    print("Sentences containing intrasent", int((df['num_intrasent'] > 0).sum()))
    print("Total num_intrasent:", df['num_intrasent'].sum())
    # Frequency of each value in the categorical columns
    print("Majority language:", df['maj_lang'].value_counts())
    print("Embedded:", df['embedded'].value_counts())

cs_Том_1.csv
        Unnamed: 0         part      chapter    paragraph     num_sent  \
count  7599.000000  7599.000000  7599.000000  7599.000000  7599.000000   
mean   3799.000000     1.929991    10.854718    26.290038     5.263193   
std    2193.786681     0.824311     6.742233    22.451934     5.303398   
min       0.000000     1.000000     1.000000     0.000000     1.000000   
25%    1899.500000     1.000000     5.000000     9.000000     2.000000   
50%    3799.000000     2.000000    10.000000    22.000000     4.000000   
75%    5698.500000     3.000000    17.000000    38.000000     7.000000   
max    7598.000000     3.000000    24.000000   129.000000    33.000000   

       num_interturn  num_intersent     len_sent  num_intrasent  
count    7599.000000    7599.000000  7599.000000    7599.000000  
mean        0.038295       0.029346    17.971707       0.103961  
std         0.191919       0.168785    13.842232       0.455129  
min         0.000000       0.000000     1.000000       0

In [25]:
# Display the sentences with the highest number of intra-sentential switches.
# The maximum is computed instead of hard-coding 7, so the cell keeps matching
# its description for whichever table `df` currently holds.
# NOTE(review): `df` is whatever the per-volume loop read last, yet the saved
# output shows Том_1 rows -- confirm which volume is intended here.
df[df['num_intrasent'] == df['num_intrasent'].max()]

Unnamed: 0.1,Unnamed: 0,volume,part,chapter,paragraph,num_sent,num_interturn,num_intersent,tokens,len_sent,cs,cs_indices,num_intrasent,maj_lang,first_lang,last_lang,embedded
125,125,Том_1,1,1,37,3,1,0,"['—', 'Ecoutez', ',', 'chère', 'Annette', ',',...",56,"['', 'non-ru', '', 'non-ru', 'non-ru', '', '',...","[7, 25, 34, 37, 44, 45, 47]",7,ru,non-ru,ru,False
4315,4315,Том_1,2,12,41,5,1,2,"['Nous', 'sommes', 'mackés', '.', '—', 'заключ...",27,"['non-ru', 'non-ru', 'non-ru', '', '', 'ru', '...","[5, 13, 16, 18, 20, 21, 23]",7,ru,non-ru,ru,False


### Analyze linguistic features

In [50]:
# Load the feature table, then try to parse the string cells of each column
# as Python literals. A column is replaced only when every string cell in it
# parses; otherwise it is left exactly as read and the column name plus the
# error message are printed.
feats = pd.read_csv('../outputs/features.csv')
for col in feats.columns:
    try:
        parsed = feats[col].apply(
            # non-string cells (numbers, NaN) pass through untouched
            lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
        )
        feats[col] = parsed
    except Exception as err:
        print(col, err)


morph invalid syntax (<unknown>, line 1)
lang malformed node or string on line 1: <ast.Name object at 0x7f1077bafbb0>
position malformed node or string on line 1: <ast.Name object at 0x7f1077baf910>


In [49]:
# Show the column labels of the feature table.
feats.columns

Index(['Unnamed: 0', 'text', 'switch_len', 'pos', 'lemma', 'dep', 'morph',
       'lang', 'position'],
      dtype='object')

In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) of the feature table.
feats.describe()

Unnamed: 0.1,Unnamed: 0,switch_len
count,1273.0,1273.0
mean,636.0,9.378633
std,367.627756,9.152426
min,0.0,1.0
25%,318.0,3.0
50%,636.0,6.0
75%,954.0,12.0
max,1272.0,85.0


In [64]:
# Number of rows per value of the 'lang' column.
# NOTE(review): the saved output lists many codes besides ru/fr (e.g. tl, vi,
# af) -- presumably language-identification noise; verify before interpreting.
feats['lang'].value_counts()

lang
ru    698
fr    427
en     29
de     22
pt     15
no     11
ca      9
et      9
tl      8
es      8
it      8
ro      5
af      5
id      3
vi      3
nl      2
hu      2
sl      2
fi      2
hr      2
tr      1
pl      1
lv      1
Name: count, dtype: int64

In [65]:
# Number of rows per value of 'switch_len', most frequent value first.
feats['switch_len'].value_counts()

switch_len
3     187
4     115
2     114
6      94
5      92
7      88
11     60
8      58
1      49
9      47
10     38
13     35
14     33
12     28
18     27
16     25
15     23
25     12
17     12
22     12
19     10
21     10
31      9
20      8
23      8
28      7
26      7
30      6
24      6
34      5
32      5
29      5
36      5
35      5
39      4
49      3
27      3
37      3
47      2
59      2
60      1
54      1
33      1
46      1
56      1
38      1
48      1
58      1
40      1
42      1
85      1
Name: count, dtype: int64

In [54]:
# Frequency of each 'pos' value among rows whose switch_len is 1.
feats.loc[feats['switch_len'] == 1, 'pos'].value_counts()

pos
[PROPN]    18
[NOUN]      9
[DET]       6
[CCONJ]     5
[ADP]       4
[ADJ]       4
[VERB]      2
[PRON]      1
Name: count, dtype: int64

In [55]:
# Frequency of each 'dep' value among rows whose switch_len is 1.
feats.loc[feats['switch_len'] == 1, 'dep'].value_counts()

dep
[ROOT]    49
Name: count, dtype: int64

In [56]:
# Frequency of each 'lang' value among rows whose switch_len is 1.
feats.loc[feats['switch_len'] == 1, 'lang'].value_counts()

lang
ru    22
fr    10
ca     4
et     2
pt     2
no     2
en     2
es     1
hr     1
it     1
tl     1
id     1
Name: count, dtype: int64

In [62]:
# Frequency of each 'pos' value among rows with switch_len 2 and lang 'ru'.
feats.loc[(feats['switch_len'] == 2) & (feats['lang'] == 'ru'), 'pos'].value_counts()

pos
[PROPN, PUNCT]    7
[NOUN, PUNCT]     4
[VERB, PUNCT]     2
[ADP, PROPN]      2
[PART, PUNCT]     2
[PUNCT, PROPN]    2
[PUNCT, ADP]      1
[ADJ, PUNCT]      1
[ADV, PUNCT]      1
[CCONJ, ADV]      1
[VERB, CCONJ]     1
[PUNCT, NUM]      1
[ADJ, PROPN]      1
[INTJ, PUNCT]     1
[DET, PUNCT]      1
[CCONJ, ADJ]      1
[VERB, ADJ]       1
[PUNCT, PART]     1
[VERB, ADP]       1
Name: count, dtype: int64

In [63]:
# Frequency of each 'pos' value among rows with switch_len 2 and lang 'fr'.
# NOTE(review): the saved output contains one three-tag entry
# ([NOUN, VERB, ADP]) although switch_len is 2 -- check how switch_len relates
# to the length of the tag list.
feats.loc[(feats['switch_len'] == 2) & (feats['lang'] == 'fr'), 'pos'].value_counts()

pos
[NOUN, PROPN]        17
[PROPN, PUNCT]       16
[NOUN, PUNCT]         6
[DET, NOUN]           6
[ADJ, PUNCT]          2
[NOUN, VERB, ADP]     1
[PUNCT, ADJ]          1
[PRON, PROPN]         1
[AUX, PUNCT]          1
[VERB, PUNCT]         1
[ADV, VERB]           1
Name: count, dtype: int64