In [2]:
import nltk, os, glob
nltk.download('averaged_perceptron_tagger_ru')

[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     C:\Users\c1635922\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!


True

In [3]:
def posTag(string):

    # POS tag words.
    token = nltk.pos_tag(nltk.word_tokenize(string), lang='rus')
    return token

In [4]:
from nltk.stem import SnowballStemmer

# Sets stemmer to Russian
stemmer = SnowballStemmer("russian")

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        # Uses SnowBall stemmer as it is compatible with Russian.
        # This strips prefixes/suffixes to give a base word.
        stems.append(stemmer.stem(item))
    return stems

In [5]:
# Takes files and their content and adds them to a dictionary.
# K= file directory V= file content
def files_to_corpus():
    corpus = {}

    # Specifies directory
    txt_dir = '../txts'

    # Finds all text files in the directory
    txt_path = os.path.join(txt_dir, '*.txt')

    # Uses Glob with the txt_path variable to place all the txt file directories in a list
    file_list = glob.glob(txt_path)

    # Opens each file in file list, reads the file and adds the text to a dictionary.
    # The key is the file and the value is the text.
    for f in file_list:
        with open(f, 'r', encoding="utf-8") as fs:
                text = fs.read()
                corpus[f] = text
    return corpus

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer

import pandas as pd
import plotly.graph_objects as go

# Links used for understanding tfidf
# https://towardsdatascience.com/higher-accuracy-and-less-process-time-in-text-classification-with-lda-and-tf-idf-d2d949e344c3
# https://www.bogotobogo.com/python/NLTK/tf_idf_with_scikit-learn_NLTK.php
# https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

corpus = files_to_corpus()

# TfidfVect object with variables specified.
tfidf = TfidfVectorizer(tokenizer=tokenize, analyzer='word',
                        sublinear_tf=True, smooth_idf=True, use_idf=True)

# Forms a tfidf Matrix using the text from the files.
tfidf_matrix = tfidf.fit_transform(corpus.values())
feature_names = tfidf.get_feature_names()

#  
for i in range(len(corpus)):
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])

# sum tfidf frequency of each term through documents
sums = tfidf_matrix.sum(axis=0)

# connecting term to its sum frequency
data = []
for col, term in enumerate(feature_names):
    if col > 1000 and len(term) > 3:
        data.append( (str(posTag(term)), sums[0,col] ))

In [7]:
df = pd.DataFrame(data, columns=['term','rank'])
ordering = df.sort_values('rank', ascending=True)

plt_data = [go.Bar(
            x= list(ordering['rank'].head(10)),
            y= list(ordering['term'].head(10)),
            orientation='h',
            marker=dict(
                color='rgba(50, 171, 96, 0.6)',
                line=dict(
                    color='rgba(50, 171, 96, 1.0)',
                    width=1),),)]

plt_layout = ({"title": "TFIDF - Russian Tweets 03/2020-06/2020",
                       "yaxis": {"title":"Terms"},
                       "xaxis": {"title":"IDF"},
                       "showlegend": False})

fig = go.Figure(data=plt_data, layout=plt_layout)

fig.show()

In [8]:
# Saves to CSV
compression_opts = dict(method='zip',
                        archive_name='out.csv')  

ordering.to_csv('out.zip', index=False,
          compression=compression_opts)

Read SCHEMA of json file

In [9]:
import pandas as pd

# Open json, print json schema
df_idf = pd.read_json("../russianDataSets/ru/2020-03-01.json",lines=True)

# print schema
print("Schema:\n\n",df_idf.dtypes)
print("Number of questions,columns=",df_idf.shape)

Schema:

 quote_count                                int64
contributors                             float64
truncated                                   bool
text                                      object
is_quote_status                             bool
in_reply_to_status_id                    float64
reply_count                                int64
id                                         int64
favorite_count                             int64
created_at_src                            object
collected_by                              object
retweeted                                   bool
investigationId                            int64
coordinates                               object
timestamp_ms                      datetime64[ns]
entities                                  object
in_reply_to_screen_name                   object
id_str                                     int64
text_translation                          object
retweet_count                              int64
stored_at 

In [10]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    if text[2] == "rt":
        text=""
    #remove tags
    text=re.sub("","",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

df_idf['text'] = df_idf['text'].apply(lambda x:pre_process(x))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("../txts/2020-03-01.txt")

#get the text column 
docs=df_idf['text'].tolist()

#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
cv=CountVectorizer(max_df=0.85,stop_words=stopwords,max_features=10000)
word_count_vector=cv.fit_transform(docs)
list(cv.vocabulary_.keys())[:10]


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['12', '13', '1400', '1595', '1600', '2008', '22', '2выздоровевшие', 'авто', 'астроном', 'блокчейн', 'вице', 'военно', 'военной', 'вторых', 'гей', 'демократа', 'док', 'доказательство', 'другому', 'душит', 'дь', 'европейска', 'женщину', 'журналистике', 'заболевшие', 'заек', 'захватчк', 'избавление', 'израиль', 'квартиры', 'коллега', 'коммунизм', 'крови', 'летняя', 'лужаек', 'марино', 'морских', 'морской', 'наркомана', 'недоразумение', 'нибудь', 'нон', 'ов', 'онлайн', 'оружии', 'отдохни', 'пекински', 'первых', 'пид', 'погибшие', 'подруга', 'порно', 'прежнему', 'премьер', 'прерогатива', 'пресс', 'причинно', 'пропаганда', 'рмот', 'салоном', 'сан', 'сканворды', 'следственных', 'службы', 'спаса', 'стопом', 'су', 'тату', 'тихо', 'толку', 'укро', 'чудо', 'чью', 'штаб', 'штирлицно', 'эксперта'] not in stop_words.



['rt',
 'shining_stri',
 'coldabbess',
 'тёмная',
 'даёт',
 'плю',
 'https',
 'co',
 'oubahqve',
 'vovablya']

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [13]:
# read test docs into a dataframe and concatenate title and body
df_test=pd.read_json("../russianDataSets/ru/2020-03-02.json",lines=True)
df_test['text'] =df_test['text'].apply(lambda x:pre_process(x))

# get test docs into a list
docs_test=df_test['text'].tolist()