In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('data.csv')
df[:100]

Unnamed: 0.1,Unnamed: 0,word,lemma,pos_tag,freq
0,0,ამ,ეს,Pron,1041654
1,1,ზამთრის,ზამთარი,N,8035
2,2,მიწურულს,მიწურული,Dat,5976
3,3,თითქმის,თითქმის,Adv,62608
4,4,სასოწარკვეთილი,სასოწარკვეთილი,A,844
...,...,...,...,...,...
95,105,შუბლზე,შუბლი,N,1697
96,106,ვაკოცე,კოცნა,V,174
97,107,როგორც,როგორც,Cj,478016
98,108,ჩანს,*ჩენა,V,57172


In [40]:
df = df.drop(["Unnamed: 0"], axis=1)
df

In [5]:
df.query("word == 'თქმა'")

Unnamed: 0,word,lemma,pos_tag,freq
2978,თქმა,\t,<MWE>,82436


In [6]:
df.query("pos_tag.str.startswith('PP')")

In [7]:
junk = ['Foreign', 'Unknown', '>XCOMP', 'Symbol', 'Guess', '<MWE>', 'X', 'Punct']
# SG -> a LOT. PL -> a LOT.
df.query("pos_tag == 'Rel:ც'")

In [8]:
df.groupby(by='pos_tag').pos_tag.count()

In [9]:
main_pos = ['N', 'V', 'Adv', 'A', 'Num', 'Pron', 'Cj', 'Interj']

In [10]:
def convert(tag):
    try:
        return {"Nom":"N", "Dat":"N", "Gen":"N", "Erg":"N", "Voc":"N", "Inst":"N", "Foc":"Adv"}[tag]
    except KeyError:
        if tag in main_pos:
            return tag
        return 'Other'

df.pos_tag = df.pos_tag.agg(convert)

In [11]:
plt.figure(figsize=(10, 15))
sns.barplot(data=df.groupby(by="pos_tag")["word"].count().reset_index().sort_values(by="word", ascending=False), y="pos_tag", x="word")

## სიტყვის სიგრძეები

In [12]:
df['word_length'] = df.word.str.len()
df['lemma_length'] = df.lemma.str.len()
df['ratio'] = df['word_length'] / df['lemma_length']
df[['word', 'ratio', 'word_length', 'lemma_length']]

In [13]:
print(f'max: {df["lemma_length"].max()}, min: {df["lemma_length"].min()}')
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='lemma_length')

In [14]:
df.lemma_length.quantile(q=[ i / 100 for i in range(1, 100)])

In [15]:
df['lemma_length_category'] = pd.qcut(df.lemma_length, [0, 0.33, 0.67, 1], labels=['low', 'medium', 'high'])

In [16]:
print(f'max: {df["word_length"].max()}, min: {df["word_length"].min()}')
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='word_length')

In [17]:
df['word_length_category'] = pd.qcut(df.word_length, [0, 0.33, 0.67, 1], labels=['low', 'medium', 'high'])

In [18]:
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='word_length', color='red', alpha=0.7)
sns.countplot(data=df, x='lemma_length', color='blue', alpha=0.7)
plt.xlabel("")

In [19]:
print(f'max: {df["ratio"].max()}, min: {df["ratio"].min()}')
plt.figure(figsize=(10, 5))
sns.histplot(data=df[df.ratio.between(*list(df.ratio.quantile([.01, .9])))], x='ratio', binwidth=0.2, stat='count')

In [20]:
df['ratio'].quantile()

In [21]:
conditions = [
    df['ratio'] < 1,
    df['ratio'] == 1,
    df['ratio'] > 1
]
choices = ['less', 'equal', 'greater']
df['ratio_category'] = np.select(conditions, choices)

In [22]:
sns.countplot(df, x='ratio_category')

In [23]:
df.query('ratio_category == "greater"')

## სხვათა სიტყვათა ფორმები

In [24]:
reported_speech_candidates = df[df.word.str.endswith('ო')].copy()
reported_speech_candidates['word'] = reported_speech_candidates.word.str[:-1]
reported_speech_candidates = reported_speech_candidates[reported_speech_candidates.word.isin(df.word)]
third_person_reported_speech = df.index.isin(reported_speech_candidates[reported_speech_candidates.word.isin(pd.merge(reported_speech_candidates, df, on=['word', 'lemma', 'pos_tag'], how='inner').word)].index)

df['reported_speech'] = df.word.str.endswith(('-მეთქი', '-თქო')) | third_person_reported_speech

In [25]:
df['reported_speech'] = df['reported_speech'].astype('str')

In [26]:
df.groupby(by='reported_speech').word.count()

In [27]:
sns.countplot(data=df, x='reported_speech')

## სიხშირეები

In [28]:
df.freq.quantile(q=[ i / 100 for i in range(1, 100)])

In [29]:
df.query('freq > 100').shape[0]

In [30]:
df['freq_category'] = pd.qcut(df.freq, [0, 0.5, 0.9, 1], labels=['low', 'medium', 'high'])

In [31]:
df.groupby(by='freq_category').freq.mean()

In [32]:
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='freq_category')

In [166]:
df.head()

Unnamed: 0,word,lemma,pos_tag,freq,word_length,lemma_length,ratio,freq_category
0,ამ,ეს,Pron,1041654,2,2,1.0,high
1,ზამთრის,ზამთარი,N,8035,7,7,1.0,high
2,მიწურულს,მიწურული,Dat,5976,8,8,1.0,high
3,თითქმის,თითქმის,Adv,62608,7,7,1.0,high
4,სასოწარკვეთილი,სასოწარკვეთილი,A,844,14,14,1.0,high


## ტრეინ / ტესტ დაყოფა

In [295]:
classes = df[['pos_tag', 'ratio_category', 'word_length_category', 'freq_category']]

In [305]:
df.groupby(by=['pos_tag', 'ratio_category', 'word_length_category']).word.count()

pos_tag  ratio_category  word_length_category
A        equal           low                      16096
                         medium                   24906
                         high                     17211
         greater         low                      19072
                         medium                   51359
                                                  ...  
V        greater         medium                  154679
                         high                     80050
         less            low                      22896
                         medium                    8482
                         high                      1526
Name: word, Length: 81, dtype: int64

In [296]:
train_test_split(df, test_size=0.3, stratify=classes, random_state=42)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [306]:
df.groupby(by=['pos_tag', 'ratio_category', 'word_length_category']).word.count()

pos_tag  ratio_category  word_length_category
A        equal           low                      16096
                         medium                   24906
                         high                     17211
         greater         low                      19072
                         medium                   51359
                                                  ...  
V        greater         medium                  154679
                         high                     80050
         less            low                      22896
                         medium                    8482
                         high                      1526
Name: word, Length: 81, dtype: int64

In [None]:
 # საშუალო სიხშირე