In [None]:
%%capture

!pip install nltk conllu
import nltk
import conllu

In [None]:
import csv
from nltk.parse.dependencygraph import DependencyGraph
import pandas as pd 
import spacy




In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')
# Location of the annotated CoNLL-U file on the mounted drive.
data_train = "/content/drive/MyDrive/Ru_Syntagrus_Dataset/full_annotated_sentence.conllu"

Mounted at /content/drive


In [None]:
# Read the UD Russian SynTagRus dataset in CoNLL-U format.
# The encoding is given explicitly because the file contains Cyrillic text: without it,
# open() falls back to the platform default, which is not UTF-8 everywhere.
with open(data_train, "r", encoding="utf-8") as file:
    # Parse the whole file into a list of sentences (one entry per CoNLL-U sentence block).
    sentences = conllu.parse(file.read())

# Calculate the dependency distance for each sentence, which is the number of words between two words that have a dependency relationship.

In [None]:
%%capture

!pip3 install spacy
!python3 -m spacy download ru_core_news_sm

In [None]:


# Load the spaCy Russian language model.
# `nlp` is the pipeline object that the feature-extraction loops call on each sentence text.
nlp = spacy.load("ru_core_news_sm")

In [None]:
# One record of dependency-distance features is collected per sentence
features = []

for sentence in sentences:
    # Re-parse the raw sentence text with spaCy
    text = sentence.metadata["text"]
    doc = nlp(text)

    # One value per head -> child arc: the number of tokens lying strictly between the two
    distances = [
        abs(child.i - head.i) - 1
        for head in doc
        for child in head.children
    ]

    # Summary statistics over the arcs; both are 0 when the parse has no arcs at all
    if distances:
        avg_path_length = sum(distances) / len(distances)
        max_path_length = max(distances)
    else:
        avg_path_length = 0
        max_path_length = 0

    record = {
        "text": text,
        "dependency_distance": distances,
        "average_path_length": avg_path_length,
        "maximum_path_length": max_path_length,
    }
    features.append(record)


In [None]:
# Persist the per-sentence dependency features as CSV (header row first, then one row per sentence)
dependency_columns = [
    "text",
    "dependency_distance",
    "average_path_length",
    "maximum_path_length",
]
with open("dependency_dist.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=dependency_columns)
    writer.writeheader()
    writer.writerows(features)

In [None]:
# Read the dependency features back from the CSV written above and preview the first rows.
sent_feat = pd.read_csv('dependency_dist.csv')
sent_feat.head()

Unnamed: 0,text,dependency_distance,average_path_length,maximum_path_length
0,За неделю начальник управления собирался позво...,"[0, 0, 2, 1, 0, 24, 0, 18, 2, 1, 0, 0, 3, 0, 1...",2.275862,24
1,На этот раз она была в сильном возбуждении и м...,"[1, 0, 4, 3, 2, 1, 0, 1, 6, 0, 1, 0, 1, 0]",1.428571,6
2,"Ребята, которые посмелей, конечно, уехали на к...","[6, 18, 0, 0, 5, 4, 3, 1, 1, 9, 0, 2, 1, 0, 3,...",2.947368,18
3,Да и работать она может только в системе связи...,"[0, 3, 1, 0, 2, 6, 9, 1, 0, 0, 1, 0, 1, 0]",1.714286,9
4,"Утром он поднялся с головной болью и, не позав...","[1, 0, 2, 8, 11, 1, 0, 1, 0, 0, 4, 1, 1, 0]",2.142857,11


In [None]:
# Dimensions of the loaded table as (rows, columns).
sent_feat.shape

(1200, 4)

In [None]:
# Write the table to a second CSV file, leaving out the DataFrame index.
sent_feat.to_csv('dependency_dist_feat.csv', index = False)

# Measure the syntactic complexity of each sentence by counting the number of clauses, phrases, and subordinating conjunctions.

In [None]:
# Initialize an empty list to store the features for each sentence
features = []

# Base dependency relations that count as a clause / as a phrase.
# Labels are compared on their base relation (the part before ":") so that subtyped
# labels such as "acl:relcl" or "nsubj:pass" are counted together with their parent relation.
# NOTE(review): "csubj" is kept in the phrase group as in the original code, although it
# marks a clausal subject - confirm this grouping is intended.
CLAUSE_RELATIONS = {"acl", "advcl", "ccomp"}
PHRASE_RELATIONS = {"obl", "iobj", "obj", "nsubj", "csubj"}

# Loop through each sentence in the dataset
for sentence in sentences:
    # Parse the sentence using spaCy
    doc = nlp(sentence.metadata["text"])

    # Count the number of clauses, phrases, and subordinating conjunctions in the sentence
    num_clauses = 0
    num_phrases = 0
    num_subordinating_conjunctions = 0
    for token in doc:
        base_relation = token.dep_.split(":")[0]

        # Clause vs. phrase is decided by the dependency label, so the two are exclusive
        if base_relation in CLAUSE_RELATIONS:
            num_clauses += 1
        elif base_relation in PHRASE_RELATIONS:
            num_phrases += 1

        # The conjunction count is based on the part of speech, not the dependency label,
        # so it is tested independently instead of being chained behind the checks above
        if token.pos_ == "SCONJ":
            num_subordinating_conjunctions += 1

    # Append the features for this sentence to the list of features
    features.append({
        "text": sentence.metadata["text"],
        "num_clauses": num_clauses,
        "num_phrases": num_phrases,
        "num_subordinating_conjunctions": num_subordinating_conjunctions
    })

In [None]:
# Save the features for each sentence to a CSV file
with open("syntactic_complexity.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["text", "num_clauses", "num_phrases", "num_subordinating_conjunctions"])
    writer.writeheader()
    for feature in features:
        writer.writerow(feature)

In [None]:
# Read the complexity counts back from the CSV written above and preview the first rows.
syntactic_complexity_feat = pd.read_csv('syntactic_complexity.csv')
syntactic_complexity_feat.head()

Unnamed: 0,text,num_clauses,num_phrases,num_subordinating_conjunctions
0,За неделю начальник управления собирался позво...,1,6,1
1,На этот раз она была в сильном возбуждении и м...,0,2,0
2,"Ребята, которые посмелей, конечно, уехали на к...",0,4,0
3,Да и работать она может только в системе связи...,0,3,0
4,"Утром он поднялся с головной болью и, не позав...",1,4,0


In [None]:
# Write the table to a second CSV file, leaving out the DataFrame index.
syntactic_complexity_feat.to_csv("syntactic_complexity_feat.csv", index = False)

# Features based on POS-tags

In [None]:

# One record of part-of-speech and case proportions is collected per sentence
features = []

# Loop through each sentence in the dataset
for sentence in sentences:
    # Parse the sentence using spaCy
    doc = nlp(sentence.metadata["text"])

    # Initialize counters for each type of part of speech and case
    num_nouns = 0
    num_verbs = 0
    num_adjectives = 0
    num_pronouns = 0
    num_nominative = 0
    num_genitive = 0
    total_words = 0

    # Loop through each token in the sentence
    for token in doc:
        # Every spaCy token is counted, so the proportions below are relative to all tokens
        total_words += 1

        # Check the part-of-speech tag and increment the corresponding counter
        if token.pos_ == "NOUN":
            num_nouns += 1
        elif token.pos_ == "VERB":
            num_verbs += 1
        elif token.pos_ == "ADJ":
            num_adjectives += 1
        # NOTE(review): pronouns acting as subject (dep "nsubj") are left out of this
        # count - confirm that this exclusion is intended.
        elif token.pos_ == "PRON" and token.dep_ != "nsubj":
            num_pronouns += 1

        # Grammatical case is stored in the token's morphological features, not in the
        # fine-grained tag: matching on a "nom"/"gen" suffix of `tag_` never fired and
        # produced all-zero columns. `morph.get` returns a list (empty if no Case feature).
        case_values = token.morph.get("Case")
        if "Nom" in case_values:
            num_nominative += 1
        elif "Gen" in case_values:
            num_genitive += 1

    # Compute the proportions of each type of part of speech and case.
    # An empty parse has all counters at 0; dividing by 1 then yields 0.0 instead of
    # raising ZeroDivisionError.
    denominator = max(total_words, 1)
    prop_nouns = num_nouns / denominator
    prop_verbs = num_verbs / denominator
    prop_adjectives = num_adjectives / denominator
    prop_pronouns = num_pronouns / denominator
    prop_nominative = num_nominative / denominator
    prop_genitive = num_genitive / denominator

    # Append the features for this sentence to the list of features
    features.append({
        "text": sentence.metadata["text"],
        "prop_nouns": prop_nouns,
        "prop_verbs": prop_verbs,
        "prop_adjectives": prop_adjectives,
        "prop_pronouns": prop_pronouns,
        "prop_nominative": prop_nominative,
        "prop_genitive": prop_genitive
    })


In [None]:
# Save the features for each sentence to a CSV file
with open("POS-tags.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["text", "prop_nouns", "prop_verbs", "prop_adjectives", "prop_pronouns", "prop_nominative", "prop_genitive"])
    writer.writeheader()
    for feature in features:
        writer.writerow(feature)

In [None]:
# Read the POS/case proportions back from the CSV written above and preview the first rows.
POStags_feat = pd.read_csv('POS-tags.csv')
POStags_feat.head()

Unnamed: 0,text,prop_nouns,prop_verbs,prop_adjectives,prop_pronouns,prop_nominative,prop_genitive
0,За неделю начальник управления собирался позво...,0.2,0.133333,0.066667,0.0,0.0,0.0
1,На этот раз она была в сильном возбуждении и м...,0.266667,0.066667,0.133333,0.0,0.0,0.0
2,"Ребята, которые посмелей, конечно, уехали на к...",0.2,0.1,0.0,0.0,0.0,0.0
3,Да и работать она может только в системе связи...,0.2,0.133333,0.066667,0.066667,0.0,0.0
4,"Утром он поднялся с головной болью и, не позав...",0.2,0.2,0.066667,0.0,0.0,0.0


In [None]:
# List the distinct values that occur in the nominative-proportion column.
print(POStags_feat['prop_nominative'].unique())


[0.]


In [None]:
# List the distinct values that occur in the genitive-proportion column.
print(POStags_feat['prop_genitive'].unique())


[0.]


In [None]:
#print(POStags_feat['prop_pronouns'].unique())


In [None]:
# Drop the case-proportion columns only when they carry no information, i.e. when the
# column is 0 for every sentence. The membership check keeps the cell safe to re-run
# after the columns have already been removed.
case_columns = ["prop_nominative", "prop_genitive"]
uninformative_columns = [
    column
    for column in case_columns
    if column in POStags_feat.columns and (POStags_feat[column] == 0).all()
]
POStags_feat = POStags_feat.drop(columns=uninformative_columns)
POStags_feat.head()

Unnamed: 0,text,prop_nouns,prop_verbs,prop_adjectives,prop_pronouns
0,За неделю начальник управления собирался позво...,0.2,0.133333,0.066667,0.0
1,На этот раз она была в сильном возбуждении и м...,0.266667,0.066667,0.133333,0.0
2,"Ребята, которые посмелей, конечно, уехали на к...",0.2,0.1,0.0,0.0
3,Да и работать она может только в системе связи...,0.2,0.133333,0.066667,0.066667
4,"Утром он поднялся с головной болью и, не позав...",0.2,0.2,0.066667,0.0


In [None]:
# Write the table to a second CSV file, leaving out the DataFrame index.
POStags_feat.to_csv("POS-tags_feat.csv", index= False)

# Merging all files

In [None]:
import pandas as pd

# Load the three per-sentence feature tables; each has a "text" column plus its own features
df1 = pd.read_csv("dependency_dist_feat.csv")
df2 = pd.read_csv("syntactic_complexity_feat.csv")
df3 = pd.read_csv("POS-tags_feat.csv")

# Stack the tables row-wise: every row keeps the columns of its own file and gets NaN
# in the columns that only exist in the other files
merged_df = pd.concat([df1, df2, df3])

# Group the dataframe by the "text" column and compute the mean of each group.
# The mean skips NaN, so the rows of one sentence collapse into a single row.
# numeric_only=True leaves out the string column "dependency_distance" explicitly;
# pandas >= 2.0 raises TypeError when asked to average a non-numeric column.
# Note that groupby also sorts the result by "text".
grouped_df = merged_df.groupby("text").mean(numeric_only=True).reset_index()

grouped_df.to_csv("extracted_features.csv", index=False)


In [None]:
# Read the merged feature table back from disk and preview the first rows.
feat = pd.read_csv('extracted_features.csv')
feat.head()

Unnamed: 0,text,average_path_length,maximum_path_length,num_clauses,num_phrases,num_subordinating_conjunctions,prop_nouns,prop_verbs,prop_adjectives,prop_pronouns
0,Автомобили появятся в салонах официальных диле...,1.777778,7.0,0.0,3.0,0.0,0.4,0.1,0.1,0.0
1,Автор сам бывший шахтер и очень хорошо описыва...,2.586207,25.0,1.0,6.0,0.0,0.233333,0.1,0.1,0.033333
2,Агент МИ6 работал паспортистом в посольстве Ве...,1.111111,6.0,0.0,1.0,0.0,0.3,0.1,0.0,0.0
3,Администрация города обратилась в прокуратуру ...,1.5,11.0,0.0,4.0,0.0,0.4,0.133333,0.0,0.0
4,Академик Аганбегян считает: в первую очередь с...,2.157895,16.0,0.0,4.0,0.0,0.35,0.15,0.05,0.0


In [None]:
# Dimensions of the merged table as (rows, columns).
feat.shape

(1200, 10)

In [None]:
#feat.to_csv('full_extracted_features.csv', index = False)

# Adding the number of tokens and the total word frequency for each sentence.

In [None]:
import nltk
import pandas as pd

# `nltk.word_tokenize` needs the Punkt tokenizer data. Older NLTK releases read it from
# the 'punkt' package, recent releases from 'punkt_tab'; both are fetched so the
# tokenizer works whichever NLTK version the unpinned install provides.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load the merged per-sentence features. The file read here is the one the merging step
# writes ('extracted_features.csv'); 'full_extracted_features.csv' is not written by
# any active cell, so reading it fails on a fresh run.
feat = pd.read_csv('extracted_features.csv')
# The frequency dictionary is tab-separated despite its .csv extension.
freq_list = pd.read_csv('freqrnc2011.csv', delimiter='\t')

In [None]:
# Preview the first rows of the frequency dictionary.
freq_list.head()

Unnamed: 0,Lemma,PoS,Freq(ipm),R,D,Doc
0,а,conj,8198.0,100,97,32332
1,а,intj,19.8,99,90,757
2,а,part,6.1,59,79,128
3,а,s,2.7,59,85,160
4,аа,intj,1.5,47,80,68


In [None]:
def count_tokens(sentence):
    """Return how many tokens NLTK's word tokenizer produces for `sentence`."""
    return len(nltk.word_tokenize(sentence))

In [None]:
# Lemma -> frequency (ipm). The dictionary lists a lemma once per part of speech, so the
# rows of a lemma are summed; converting the raw column with to_dict() would silently
# keep only the last row of each lemma.
frequency_list = freq_list.groupby('Lemma')['Freq(ipm)'].sum().to_dict()


def get_word_freq(word):
    """Return the frequency (ipm) stored for `word`, or 0 when it is not listed.

    The exact spelling is tried first; if it is missing, the lowercase form is tried so
    that capitalised tokens (e.g. at the start of a sentence) still match their entry.

    NOTE(review): the dictionary is keyed by lemma, while callers pass surface tokens;
    inflected forms are therefore presumably not matched - confirm whether lemmatisation
    is wanted here.
    """
    freq = frequency_list.get(word)
    if freq is None:
        freq = frequency_list.get(word.lower(), 0)
    return freq

In [None]:
def _total_frequency(text):
    """Sum the dictionary frequency of every token NLTK finds in `text`."""
    return sum(get_word_freq(token) for token in nltk.word_tokenize(text))


# Extend the main dataframe with the token count and the summed token frequency per sentence
sentence_texts = feat['text']
feat['num_tokens'] = sentence_texts.apply(count_tokens)
feat['total_freq'] = sentence_texts.apply(_total_frequency)

In [None]:
# Preview the first rows, now including the num_tokens and total_freq columns.
feat.head()

Unnamed: 0,text,average_path_length,maximum_path_length,num_clauses,num_phrases,num_subordinating_conjunctions,prop_nouns,prop_verbs,prop_adjectives,prop_pronouns,num_tokens,total_freq
0,Автомобили появятся в салонах официальных диле...,1.777778,7.0,0.0,3.0,0.0,0.4,0.1,0.1,0.0,10,62748.4
1,Автор сам бывший шахтер и очень хорошо описыва...,2.586207,25.0,1.0,6.0,0.0,0.233333,0.1,0.1,0.033333,30,125001.2
2,Агент МИ6 работал паспортистом в посольстве Ве...,1.111111,6.0,0.0,1.0,0.0,0.3,0.1,0.0,0.0,10,62748.4
3,Администрация города обратилась в прокуратуру ...,1.5,11.0,0.0,4.0,0.0,0.4,0.133333,0.0,0.0,15,74447.2
4,Академик Аганбегян считает: в первую очередь с...,2.157895,16.0,0.0,4.0,0.0,0.35,0.15,0.05,0.0,20,99152.8


In [None]:
# Dimensions of the table as (rows, columns) after the two new columns were added.
feat.shape

(1200, 12)

### Adding the average token length for each sentence

In [None]:
#import pandas as pd
#annotations_df = pd.read_csv('full_annotated_data.tsv')
#features_df = pd.read_csv('full_extracted_features.csv')

In [None]:
def _avg_token_length(text):
    """Return the mean character length of the whitespace-separated tokens in `text`.

    Returns 0.0 when `text` contains no tokens (empty or whitespace only), which would
    otherwise cause a division by zero.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)


# `features_df` is not created by any active cell, so it is derived here from `feat`,
# the table that already holds the num_tokens and total_freq columns. A copy is used so
# that `feat` itself is left unchanged.
features_df = feat.copy()

# Calculate the average token length for each sentence
avg_token_length = features_df['text'].apply(_avg_token_length)
# Add the new column to the DataFrame
features_df['avg_token_length'] = avg_token_length



In [None]:
# Show the first row to check that the avg_token_length column is present.
features_df.head(1)

Unnamed: 0,text,average_path_length,maximum_path_length,num_clauses,num_phrases,num_subordinating_conjunctions,prop_nouns,prop_verbs,prop_adjectives,prop_pronouns,num_tokens,total_freq,avg_token_length
0,Автомобили появятся в салонах официальных диле...,1.777778,7.0,0.0,3.0,0.0,0.4,0.1,0.1,0.0,10,62748.4,6.222222


In [None]:
#features_df.to_csv("full_extracted_features.csv", index= False)