# Analysis of Code Switching in Vojna i Mir

In [1]:
import ast
import os
import pandas as pd

### Analyze code-switching (CS) types in the different volumes

In [2]:
# Folder with the per-volume code-switching tables (files named `cs_*`).
output_dir = '../outputs/'
for f in sorted(os.listdir(output_dir)):
    # Skip everything that is not a per-volume CS table.
    if not f.startswith('cs_'):
        continue
    df = pd.read_csv(os.path.join(output_dir, f))
    print(f)
    # count / mean / std / min / quartiles / max for every numeric column
    print(df.describe())
    # column-wise totals of the numeric columns
    print(df.sum(numeric_only=True))

    # NOTE(review): an earlier draft de-duplicated rows on
    # (volume, part, chapter, paragraph) before summing; the totals below are
    # taken over all rows -- confirm that is the intended unit of counting.
    print("Total num_interturn:", df['num_interturn'].sum())
    print("Total num_intersent:", df['num_intersent'].sum())
    # Number of rows with at least one intra-sentential switch (vectorised
    # comparison instead of a per-element lambda).
    print("Sentences containing intrasent", int((df['num_intrasent'] > 0).sum()))
    print("Total num_intrasent:", df['num_intrasent'].sum())
    print("Majority language:", df['maj_lang'].value_counts())
    print("Embedded:", df['embedded'].value_counts())

cs_Том_1.csv
        Unnamed: 0         part      chapter    paragraph     num_sent  \
count  7599.000000  7599.000000  7599.000000  7599.000000  7599.000000   
mean   3799.000000     1.929991    10.854718    26.290038     5.263193   
std    2193.786681     0.824311     6.742233    22.451934     5.303398   
min       0.000000     1.000000     1.000000     0.000000     1.000000   
25%    1899.500000     1.000000     5.000000     9.000000     2.000000   
50%    3799.000000     2.000000    10.000000    22.000000     4.000000   
75%    5698.500000     3.000000    17.000000    38.000000     7.000000   
max    7598.000000     3.000000    24.000000   129.000000    33.000000   

       num_interturn  num_intersent     len_sent  num_intrasent  
count    7599.000000    7599.000000  7599.000000    7599.000000  
mean        0.038295       0.029346    17.971707       0.103961  
std         0.191919       0.168785    13.842232       0.455129  
min         0.000000       0.000000     1.000000       0

### Analyze linguistic features

In [5]:
def _literal_or_raw(cell):
    """Evaluate a string cell as a Python literal; return non-strings as-is."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


feats = pd.read_csv('../outputs/features.csv')
for col in feats.columns:
    # A column is replaced only if every one of its cells evaluates; on the
    # first failure the column keeps its raw values and the error is printed.
    try:
        converted = feats[col].apply(_literal_or_raw)
        feats[col] = converted
    except Exception as e:
        print(col, e)


morph invalid syntax (<unknown>, line 1)
lang malformed node or string on line 1: <ast.Name object at 0x7f413d323940>
position malformed node or string on line 1: <ast.Name object at 0x7f413d41dc90>


In [6]:
# Column labels of the features table.
feats.columns

Index(['Unnamed: 0', 'text', 'switch_len', 'pos', 'lemma', 'dep', 'morph',
       'lang', 'position'],
      dtype='object')

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the features table.
feats.describe()

Unnamed: 0.1,Unnamed: 0,switch_len
count,1273.0,1273.0
mean,636.0,7.428123
std,367.627756,7.78836
min,0.0,1.0
25%,318.0,2.0
50%,636.0,5.0
75%,954.0,10.0
max,1272.0,77.0


In [8]:
# Number of rows per value of the `lang` column, most frequent first.
# NOTE(review): the recorded output lists many rare codes (tl, af, no, ...);
# presumably language-identification noise -- verify before interpreting.
feats['lang'].value_counts()

lang
ru    698
fr    427
en     28
de     22
it     12
pt     12
no     11
et     10
ca      9
es      8
tl      8
af      7
ro      4
id      3
vi      3
sk      2
fi      2
hr      2
lv      1
tr      1
sl      1
nl      1
sq      1
Name: count, dtype: int64

In [9]:
# Number of rows per value of `switch_len`, most frequent first.
feats['switch_len'].value_counts()

switch_len
2     203
3     153
1     151
5     105
4     102
6      72
7      59
9      56
8      51
11     40
13     33
10     31
12     30
16     21
14     18
15     17
18     13
19     11
22     10
23     10
21     10
25      9
26      8
28      7
20      7
17      7
31      6
27      5
29      4
24      4
32      3
39      2
44      2
35      2
42      2
38      1
30      1
54      1
50      1
43      1
52      1
36      1
48      1
77      1
Name: count, dtype: int64

#### CS instances of length 1

In [10]:
# Frequency of each `pos` value among rows with switch_len == 1.
feats.loc[feats['switch_len'].eq(1), 'pos'].value_counts()

pos
[PROPN]    55
[NOUN]     36
[ADJ]      16
[VERB]      8
[PART]      8
[DET]       7
[ADV]       6
[CCONJ]     5
[ADP]       5
[PRON]      2
[AUX]       1
[NUM]       1
[INTJ]      1
Name: count, dtype: int64

In [13]:
# Frequency of each `lang` value among rows with switch_len == 1.
feats.loc[feats['switch_len'].eq(1), 'lang'].value_counts()

lang
ru    60
fr    39
no    11
de     8
tl     6
ca     5
en     5
et     3
it     2
es     2
id     2
ro     2
sk     1
pt     1
vi     1
af     1
lv     1
hr     1
Name: count, dtype: int64

In [11]:
# `pos` frequencies for rows with switch_len == 1 and lang == 'ru'.
feats.loc[feats['switch_len'].eq(1) & feats['lang'].eq('ru'), 'pos'].value_counts()

pos
[PROPN]    19
[PART]      8
[NOUN]      6
[VERB]      6
[CCONJ]     5
[ADP]       5
[ADV]       5
[ADJ]       2
[DET]       2
[NUM]       1
[INTJ]      1
Name: count, dtype: int64

In [12]:
# `pos` frequencies for rows with switch_len == 1 and lang == 'fr'.
feats.loc[feats['switch_len'].eq(1) & feats['lang'].eq('fr'), 'pos'].value_counts()

pos
[PROPN]    21
[NOUN]      9
[ADJ]       6
[ADV]       1
[AUX]       1
[VERB]      1
Name: count, dtype: int64

In [84]:
# Frequency of each `morph` value among rows with switch_len == 1.
feats.loc[feats['switch_len'].eq(1), 'morph'].value_counts()

morph
[]                                                                                    18
[Number=Sing]                                                                          7
[Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing]                                        6
[Gender=Fem|Number=Sing]                                                               6
[Gender=Masc|Number=Sing]                                                              3
[Foreign=Yes]                                                                          2
[Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing]                                         1
[Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing]                                         1
[Gender=Masc|Number=Sing|PronType=Dem]                                                 1
[Aspect=Perf|Case=Nom|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass]     1
[Case=Nom|Gender=Neut|Number=Sing]                                                     1
[Aspect=Imp|Moo

#### CS instances of length 2

In [19]:
# Frequency of each `dep` value among rows with switch_len == 2.
feats.loc[feats['switch_len'].eq(2), 'dep'].value_counts()

dep
[det, ROOT]          68
[ROOT, flat:name]    30
[ROOT, nsubj]        27
[ROOT, ROOT]         13
[ROOT, obj]           9
[amod, ROOT]          9
[case, ROOT]          7
[cc, ROOT]            7
[ROOT, advmod]        5
[ROOT, nmod]          4
[ROOT, dep]           3
[advmod, ROOT]        3
[ROOT, punct]         3
[ROOT, appos]         3
[mo, ROOT]            1
[discourse, ROOT]     1
[expl, ROOT]          1
[parataxis, ROOT]     1
[nk, ROOT]            1
[nummod, ROOT]        1
[ROOT, cj]            1
[ROOT, amod]          1
[ROOT, xcomp]         1
[det, obj]            1
[nsubj, ROOT]         1
[ROOT, uc]            1
Name: count, dtype: int64

In [14]:
# Frequency of each `pos` value among rows with switch_len == 2.
feats.loc[feats['switch_len'].eq(2), 'pos'].value_counts()

pos
[DET, NOUN]       48
[NOUN, PROPN]     31
[VERB, PRON]      16
[X, PROPN]        10
[VERB, NOUN]      10
[VERB, PROPN]      8
[ADJ, NOUN]        7
[PROPN, NUM]       7
[NOUN, ADJ]        6
[CCONJ, ADV]       4
[ADJ, NUM]         4
[PROPN, NOUN]      4
[PROPN, PROPN]     4
[VERB, ADJ]        3
[NOUN, NOUN]       3
[ADP, NOUN]        3
[DET, ADJ]         3
[VERB, ADV]        3
[PROPN, VERB]      2
[PART, VERB]       2
[ADV, VERB]        2
[ADP, PROPN]       2
[CCONJ, ADJ]       1
[VERB, ADP]        1
[NOUN, X]          1
[PART, PRON]       1
[PART, NOUN]       1
[CCONJ, PROPN]     1
[PROPN, DET]       1
[ADP, ADV]         1
[CCONJ, PRON]      1
[PART, CCONJ]      1
[ADV, ADV]         1
[ADP, PRON]        1
[ADJ, PROPN]       1
[DET, NUM]         1
[ADJ, VERB]        1
[PART, DET]        1
[VERB, CCONJ]      1
[VERB, VERB]       1
[PRON, PROPN]      1
[PRON, NOUN]       1
[VERB, PART]       1
Name: count, dtype: int64

In [15]:
# Rows with switch_len == 2 whose `pos` entries are all NOUN or PROPN.
feats[
    feats['switch_len'].eq(2)
    & feats['pos'].apply(lambda tags: all(tag in ['NOUN', 'PROPN'] for tag in tags))
]


Unnamed: 0.1,Unnamed: 0,text,switch_len,pos,lemma,dep,morph,lang,position
200,200,"[—, Capital, !, —]",2,"[PROPN, NOUN]","[Capital, —]","[ROOT, ROOT]","[, NumType=Card]",ca,bos
266,266,"[—, Lise, !, —]",2,"[PROPN, NOUN]","[Lise, —]","[ROOT, ROOT]","[, NumType=Card]",et,bos
332,332,"[comtesse, Apraksine, .]",2,"[NOUN, PROPN]","[comtesse, Apraksine]","[ROOT, obj]","[Gender=Fem|Number=Sing, ]",et,eos
476,476,"[plate, couture, ,]",2,"[NOUN, NOUN]","[plate, couture]","[ROOT, nmod]","[Gender=Masc|Number=Sing, Gender=Fem|Number=Sing]",fr,mid
526,526,"[parlons, raison, ,]",2,"[NOUN, NOUN]","[parlon, raison]","[ROOT, amod]","[Gender=Fem|Number=Plur, Gender=Fem|Number=Sing]",fr,mid
560,560,"[Анна, Михайловна, ,]",2,"[PROPN, PROPN]","[анна, михаилович]","[ROOT, appos]","[Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,...",ru,mid
567,567,"[Анна, Михайловна, ,]",2,"[PROPN, PROPN]","[анна, михаилович]","[ROOT, appos]","[Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,...",ru,mid
603,603,"[Анна, Михайловна, ,]",2,"[PROPN, PROPN]","[анна, михаилович]","[ROOT, appos]","[Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing,...",ru,mid
613,613,"[lle, Bourienne, ,]",2,"[NOUN, PROPN]","[lle, Bourienne]","[det, ROOT]","[Gender=Fem|Number=Sing, ]",fr,mid
635,635,"[Ah, ,, Marie, !, ..., —]",2,"[NOUN, NOUN]","[Ah, Marie]","[ROOT, cj]","[Case=Nom|Gender=Fem|Number=Sing, Case=Nom|Gen...",de,bos


In [16]:
# Rows with switch_len == 2 whose `pos` entries are all DET or NOUN
# (any order or repetition of the two tags passes this filter).
feats[
    feats['switch_len'].eq(2)
    & feats['pos'].apply(lambda tags: all(tag in ['DET', 'NOUN'] for tag in tags))
]

Unnamed: 0.1,Unnamed: 0,text,switch_len,pos,lemma,dep,morph,lang,position
34,34,"[des, imbéciles, .]",2,"[DET, NOUN]","[un, imbécile]","[det, ROOT]","[Definite=Ind|Number=Plur|PronType=Art, Gender...",fr,eos
38,38,"[Мои, дети, —]",2,"[DET, NOUN]","[мой, ребёнок]","[det, ROOT]","[Case=Nom|Number=Plur, Animacy=Anim|Case=Nom|G...",ru,bos
69,69,"[ma, tante, ,]",2,"[DET, NOUN]","[mon, tante]","[det, ROOT]","[Gender=Fem|Number=Sing|Poss=Yes, Gender=Fem|N...",pt,mid
71,71,"[Ma, tante]",2,"[DET, NOUN]","[mon, tante]","[det, ROOT]","[Gender=Fem|Number=Sing|Poss=Yes, Gender=Fem|N...",id,bos
93,93,"[ma, tante, ,]",2,"[DET, NOUN]","[mon, tante]","[det, ROOT]","[Gender=Fem|Number=Sing|Poss=Yes, Gender=Fem|N...",pt,mid
151,151,"[ma, tante, ,]",2,"[DET, NOUN]","[mon, tante]","[det, ROOT]","[Gender=Fem|Number=Sing|Poss=Yes, Gender=Fem|N...",pt,mid
214,214,"[une, dame, .]",2,"[DET, NOUN]","[un, dame]","[det, ROOT]",[Definite=Ind|Gender=Fem|Number=Sing|PronType=...,fr,eos
249,249,"[mon, cher, ,]",2,"[DET, NOUN]","[mon, cher]","[det, ROOT]","[Number=Sing|Poss=Yes, Gender=Masc|Number=Sing]",en,mid
288,288,"[les, femmes]",2,"[DET, NOUN]","[le, femme]","[det, ROOT]","[Definite=Def|Number=Plur|PronType=Art, Gender...",fr,mid
293,293,"[ma, chère]",2,"[DET, NOUN]","[mon, chère]","[det, ROOT]","[Gender=Fem|Number=Sing|Poss=Yes, Gender=Fem|N...",fr,mid


In [17]:
# `pos` frequencies for rows with switch_len == 2 and lang == 'ru'.
feats.loc[feats['switch_len'].eq(2) & feats['lang'].eq('ru'), 'pos'].value_counts()

pos
[VERB, PRON]      15
[VERB, NOUN]       9
[VERB, PROPN]      8
[CCONJ, ADV]       4
[VERB, ADV]        3
[PROPN, PROPN]     3
[ADJ, NOUN]        2
[DET, NOUN]        2
[PART, VERB]       2
[ADP, PROPN]       2
[CCONJ, PROPN]     1
[PROPN, DET]       1
[PART, PRON]       1
[PART, NOUN]       1
[VERB, ADJ]        1
[CCONJ, ADJ]       1
[VERB, ADP]        1
[CCONJ, PRON]      1
[PART, DET]        1
[ADP, PRON]        1
[ADV, VERB]        1
[ADJ, PROPN]       1
[VERB, CCONJ]      1
[VERB, VERB]       1
[PROPN, VERB]      1
[PART, CCONJ]      1
[ADP, NOUN]        1
[VERB, PART]       1
Name: count, dtype: int64

In [18]:
# `pos` frequencies for rows with switch_len == 2 and lang == 'fr'.
feats.loc[feats['switch_len'].eq(2) & feats['lang'].eq('fr'), 'pos'].value_counts()

pos
[NOUN, PROPN]    30
[DET, NOUN]      29
[X, PROPN]       10
[NOUN, ADJ]       6
[PROPN, NUM]      4
[ADJ, NOUN]       3
[NOUN, NOUN]      2
[VERB, ADJ]       2
[ADP, NOUN]       2
[PRON, NOUN]      1
[ADV, VERB]       1
[ADJ, NUM]        1
[ADJ, VERB]       1
[ADP, ADV]        1
[PROPN, NOUN]     1
[NOUN, X]         1
[DET, ADJ]        1
Name: count, dtype: int64

#### CS instances of length 3

In [21]:
# Frequency of each `lang` value among rows with switch_len == 3.
feats.loc[feats['switch_len'].eq(3), 'lang'].value_counts()

lang
ru    72
fr    54
en     7
pt     5
de     5
ca     2
it     2
sk     1
nl     1
et     1
ro     1
tl     1
af     1
Name: count, dtype: int64

In [20]:
# Frequency of each `pos` value among rows with switch_len == 3.
feats.loc[feats['switch_len'].eq(3), 'pos'].value_counts()

pos
[DET, NOUN, NUM]        11
[NOUN, ADP, NOUN]        8
[DET, NOUN, NOUN]        6
[VERB, NOUN, PROPN]      6
[DET, ADJ, NOUN]         5
                        ..
[DET, ADV, ADJ]          1
[DET, NOUN, PROPN]       1
[CCONJ, PROPN, PRON]     1
[ADV, NOUN, NUM]         1
[VERB, ADP, PRON]        1
Name: count, Length: 86, dtype: int64

In [23]:
# Frequency of each `dep` value among rows with switch_len == 3.
feats.loc[feats['switch_len'].eq(3), 'dep'].value_counts()

dep
[det, ROOT, ROOT]          13
[ROOT, case, nmod]          8
[det, amod, ROOT]           7
[det, ROOT, dep]            3
[cc, advmod, ROOT]          3
                           ..
[det, ROOT, nmod]           1
[cc, ROOT, ROOT]            1
[advmod, obl:mod, ROOT]     1
[ROOT, obj, conj]           1
[ROOT, case, obl]           1
Name: count, Length: 88, dtype: int64

In [22]:
# All rows of the features table with switch_len == 3.
feats.loc[feats['switch_len'].eq(3)]

Unnamed: 0.1,Unnamed: 0,text,switch_len,pos,lemma,dep,morph,lang,position
4,4,"[мой, верный, раб, ,]",3,"[DET, ADJ, NOUN]","[мой, верный, раб]","[det, amod, ROOT]","[Case=Nom|Gender=Masc|Number=Sing, Case=Nom|De...",ru,mid
5,5,"[comme, vous, dites, .]",3,"[SCONJ, PRON, VERB]","[comme, vous, dire]","[mark, nsubj, ROOT]","[, Number=Plur|Person=2, Mood=Ind|Number=Plur|...",fr,eos
7,7,"[садитесь, и, рассказывайте, .]",3,"[VERB, CCONJ, VERB]","[садитесь, и, рассказывать]","[ROOT, cc, conj]",[Aspect=Perf|Mood=Imp|Number=Plur|Person=Secon...,ru,eos
13,13,"[A, propos, ,, —]",3,"[NOUN, NOUN, NUM]","[a, propos, —]","[case, ROOT, ROOT]","[, Gender=Masc, NumType=Card]",pt,bos
18,18,"[l’abbé, Morіo, :]",3,"[NOUN, ADJ, NOUN]","[l’, abbé, morіo]","[ROOT, punct, ROOT]","[Gender=Masc|Number=Sing, Number=Sing, Gender=...",ca,mid
...,...,...,...,...,...,...,...,...,...
1212,1212,"[mon, cher, ,, —]",3,"[DET, NOUN, NUM]","[mon, cher, —]","[det, ROOT, ROOT]","[Number=Sing|Poss=Yes, Gender=Masc|Number=Sing...",en,mid
1229,1229,"[—, Ma, foi, ,, —]",3,"[DET, NOUN, NUM]","[mon, foi, —]","[det, obj, ROOT]","[Gender=Fem|Number=Sing|Poss=Yes, Gender=Fem|N...",pt,bos
1238,1238,"[vive, l’empereur, !]",3,"[ADJ, ADJ, NOUN]","[vif, l’, empereur]","[amod, amod, ROOT]","[Gender=Fem|Number=Sing, Gender=Masc|Number=Si...",fr,mid
1239,1239,"[бежали, за, ним, .]",3,"[VERB, ADP, PRON]","[бежать, за, ним]","[ROOT, case, obl]",[Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|Ve...,ru,eos
