In [8]:
import pandas as pd
from googletrans import Translator

In [9]:
#import the dataset
df = pd.read_csv("./data/interviews/interviews.csv")
df.head(1)

Unnamed: 0.1,Unnamed: 0,file_name,label,text
0,0,BG_Box_AleksandarPovetkin_vs_AnthonyDjoshua_NO,NO,"Аз съм в много добра форма, проведох отличен т..."


In [10]:
df.columns

Index(['Unnamed: 0', 'file_name', 'label', 'text'], dtype='object')

## Translate the dataset in English

In [14]:
#instantiate the translator and detect the language
translator = Translator()
df['English'] = df['text'].apply(translator.translate, src='bg', dest='en').apply(getattr, args=('text',))

Somehow some lines are translated, but some others are not translated into English:

In [6]:
from ipyannotate import annotate
def display_record(record):
    display(record[1].drop(['Unnamed: 0', 'file_name', 'label', 'text']))

data = df.iterrows()


annotation = annotate(data, display=display_record)
ok, error, next, back = annotation.toolbar.buttons

In [21]:
display(annotation)

Annotation(canvas=OutputCanvas(outputs=({'output_type': 'display_data', 'data': {'text/plain': 'English    I a…

In [54]:
!mkdir -p data/interviews/manually_translated/bg
df['Manually_Labeled'] = [task.value for task in annotation.tasks]
for index, row in df.loc[df.Manually_Labeled == 0].iterrows():
    with open('data/interviews/manually_translated/bg/interview_'+ str(index) + '.txt', 'w') as f:
        f.write(row['English'])

In [59]:
#we translate those interviews manually and put them in this directory:
!mkdir -p data/interviews/manually_translated/en

#then we change in the dataframe
for index, row in df.loc[df.Manually_Labeled == 0].iterrows():
    with open('data/interviews/manually_translated/en/interview_'+ str(index) + '.txt', 'r') as f:
        df.loc[index, 'English'] = f.read()

In [64]:
df['text_bg'] = df.text
df['text'] = df.English
df.drop('English',1).to_csv("./data/interviews/interviews_en.csv")

## English Interviews Preprocessing

In [18]:
import pandas as pd
df = pd.read_csv("./data/interviews/interviews_en.csv")

In [19]:
df = df[['label','text']]
df.head(1)

Unnamed: 0,label,text
0,NO,"I am in very good shape, I had an excellent tr..."


In [20]:
import re, string, unicodedata
import nltk
# import contractions
# import inflect
# from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from stop_words import get_stop_words
from string import ascii_letters, digits, whitespace

import glob
import errno

### Let's clean the interviews from stop_words, puncuation etc...

In [21]:
def tokenize(text):
    words = nltk.word_tokenize(text)
    return words

In [22]:
def is_ascii(word):
    for c in word:
        if c in ascii_letters:
            return True
    return False

In [23]:
##you don't want to remove ascii for English, only for Bg...
# def remove_ascii(words):
#     """Remove ASCII characters from list of tokenized words"""
#     new_words = []
#     for word in words:
#         if not is_ascii(word):
#             new_words.append(word)
#     return new_words

In [24]:
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

In [25]:
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

In [26]:
def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = 'число_' + str(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

In [27]:
def remove_numbers(words):
    """Remove all interger occurrences in list of tokenized words"""
    new_words = []
    for word in words:
        if not word.isdigit():
            new_words.append(word)
    return new_words

In [28]:
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in get_stop_words('bg'):
            new_words.append(word)
    return new_words

In [29]:
def stem_words(words=None):
    """Stem words in list of tokenized words"""
    print("USE PRESLAV NAKOV's STEMMER !!!")

In [30]:
def remove_empty_words(words):
    new_words = []
    for word in words:
        if word.strip():
            new_words.append(word)
    return new_words

In [31]:
def print_words(df):
    for i, words in enumerate(df['words'], 1):
        print('Interview ' + str(i))
        print(words)

In [32]:
df['words'] = [tokenize(text) for text in df['text']]
#print_words(df)

df['words'] = [to_lowercase(words) for words in df['words']]
# print_words(df)

df['words'] = [remove_punctuation(words) for words in df['words']]
# print_words(df)

df['words'] = [remove_numbers(words) for words in df['words']]
# print_words(df)

df['words'] = [remove_stopwords(words) for words in df['words']]
# print_words(df)

df['words'] = [remove_empty_words(words) for words in df['words']]
# print_words(df)

#### Save all those cleaned interviews to files inside the word/ directory

In [29]:
!mkdir words

In [26]:
def words_to_file(words, index):
    outF = open("words/{0}.txt".format(index), "w")
    for i, word in enumerate(words, 1):
        outF.write(word)
        if i < len(words):
            outF.write("\n")
    outF.close()

In [27]:
def words_to_files(df):
    for row in df.itertuples():
        words_to_file(row.words, row.Index)

In [30]:
words_to_files(df)

#### Word stemming

In [49]:
#stemming explanation
#https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

In [38]:
#small example on first line
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer()
ls = LancasterStemmer()
  
# choose some words to be stemmed 
words = df.words[0]
  
for w in words: 
    print(w, " : ", ps.stem(w),ls.stem(w)) 


i  :  i i
am  :  am am
in  :  in in
very  :  veri very
good  :  good good
shape  :  shape shap
i  :  i i
had  :  had had
an  :  an an
excellent  :  excel excel
training  :  train train
camp  :  camp camp
joshua  :  joshua joshu
is  :  is is
one  :  one on
of  :  of of
the  :  the the
strongest  :  strongest strongest
heavyweight  :  heavyweight heavyweight
boxers  :  boxer box
in  :  in in
the  :  the the
world  :  world world
that  :  that that
s  :  s s
why  :  whi why
i  :  i i
m  :  m m
happy  :  happi happy
to  :  to to
have  :  have hav
the  :  the the
opportunity  :  opportun opportun
to  :  to to
be  :  be be
in  :  in in
the  :  the the
ring  :  ring ring
against  :  against against
him  :  him him
and  :  and and
i  :  i i
want  :  want want
to  :  to to
give  :  give giv
the  :  the the
fans  :  fan fan
a  :  a a
nice  :  nice nic
match  :  match match
this  :  thi thi
meeting  :  meet meet
will  :  will wil
show  :  show show
everything  :  everyth everyth
anthony  :  antho

In [39]:
df_stem = pd.DataFrame()
df_stem['words_stem_1'] = df.words.apply(lambda x : [ps.stem(word) for word in x])
df_stem['words_stem_2'] = df.words.apply(lambda x : [ls.stem(word) for word in x])
df_stem

Unnamed: 0,words_stem_1,words_stem_2
0,"[i, am, in, veri, good, shape, i, had, an, exc...","[i, am, in, very, good, shap, i, had, an, exce..."
1,"[everyon, underestim, me, but, it, will, be, a...","[everyon, underestim, me, but, it, wil, be, a,..."
2,"[leav, asid, the, techniqu, and, qualiti, we, ...","[leav, asid, the, techn, and, qual, we, both, ..."
3,"[i, take, the, match, with, andi, veri, seriou...","[i, tak, the, match, with, andy, very, sery, i..."
4,"[i, wa, push, to, place, i, had, never, been, ...","[i, was, push, to, plac, i, had, nev, been, pu..."
5,"[great, i, feel, welcom, to, romania, and, to,...","[gre, i, feel, welcom, to, roman, and, to, my,..."
6,"[grigor, is, a, seriou, oppon, i, m, sure, he,...","[grig, is, a, sery, oppon, i, m, sur, he, s, p..."
7,"[i, m, come, to, kill, thi, man, noth, happen,...","[i, m, com, to, kil, thi, man, noth, hap, the,..."
8,"[i, look, forward, to, the, battl, with, kubra...","[i, look, forward, to, the, battl, with, kubr,..."
9,"[my, last, match, wa, about, two, month, ago, ...","[my, last, match, was, about, two, month, ago,..."


#### Save all those cleaned interviews to files inside the word_stem_1 and  word_stem_2 directories

In [81]:
!mkdir words_stem_1
!mkdir words_stem_2

mkdir: cannot create directory ‘words_stem_1’: File exists


In [99]:
def words_stem_to_files(df,directory):
    for index, row in df_stem.iterrows():
        print(df.loc[index, directory])
        outF = open(directory + "/{0}.txt".format(index), "w")
        words = df.loc[index, directory]
        for i, word in enumerate(words, 1):
            outF.write(word)
            if i < len(words):
                outF.write("\n")
        outF.close()

In [101]:
words_stem_to_files(df_stem,"words_stem_1")
words_stem_to_files(df_stem,"words_stem_2")

['i', 'am', 'in', 'veri', 'good', 'shape', 'i', 'had', 'an', 'excel', 'train', 'camp', 'joshua', 'is', 'one', 'of', 'the', 'strongest', 'heavyweight', 'boxer', 'in', 'the', 'world', 'that', 's', 'whi', 'i', 'm', 'happi', 'to', 'have', 'the', 'opportun', 'to', 'be', 'in', 'the', 'ring', 'against', 'him', 'and', 'i', 'want', 'to', 'give', 'the', 'fan', 'a', 'nice', 'match', 'thi', 'meet', 'will', 'show', 'everyth', 'anthoni', 'is', 'a', 'strong', 'boxer', 'but', 'i', 'am', 'also', 'strong', 'when', 'i', 'fought', 'with', 'klitschko', 'i', 'wa', 'weaker', 'and', 'in', 'wors', 'shape', 'than', 'i', 'am', 'now']
['everyon', 'underestim', 'me', 'but', 'it', 'will', 'be', 'a', 'terribl', 'battl', 'everyon', 'will', 'see', 'what', 'i', 'can', 'do', 'i', 'am', 'here', 'to', 'shock', 'everyon', 'to', 'shock', 'the', 'world', 'and', 'to', 'show', 'who', 'i', 'am', 'i', 'm', 'readi', 'for', 'anthoni', 'joshua', 'i', 'hope', 'he', 'underestim', 'me', 'and', 'think', 'the', 'battl', 'with', 'me', 'w

#### Word embedding

[link with good general explanation about word embedding](https://machinelearningmastery.com/develop-word-embeddings-python-gensim/)

[better link for our case, explaining step by step code that is below this cell](https://www.tutorialspoint.com/gensim/gensim_creating_tf_idf_matrix.htm)

In [23]:
from gensim.corpora import Dictionary
from gensim.models import NormModel
from gensim.models import TfidfModel

In [24]:
def tf_idf(df, attr):
    documents = df[attr]
    dictionary = Dictionary(documents)
    n_items = len(dictionary)
    #docbow converts to bag of words
    corpus = [dictionary.doc2bow(text) for text in documents]
    #then we apply tfidf 
    tfidf = TfidfModel(corpus) #fit tfidf on this corpus
    corpus_tfidf = tfidf[corpus] #transform the corpus
    
    #then make a dataframe out of it
    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc :
            d[index]  = value
        ds.append(d)
    df_tfidf = pd.DataFrame(ds)
    return df_tfidf

Let's see step by step what this function does:

In [80]:
attr = 'words_stem_1'
#documents = corpus of all interviews stemmed with one stemmer
documents = df_stem[attr]
#we make a dict out of it
dictionary = Dictionary(documents)
n_items = len(dictionary)
print("\nDescription of 'dictionary':\n")
print("type:\t\t",type(dictionary))
print("len:\t\t",n_items , "this is the number of different words in the whole stemmed dataset")
print("10 first keys:\t\t",dictionary.keys()[:10])
print("10 first values:\t\t",[dictionary[x] for x in dictionary.keys()[:10]])
print("\nThe dictionary is just all the tokens. Careful, it is not in alphabetical order, \ndespite what you might think from the first values!")
# print("values:",dictionary.values()[:10])
corpus = [dictionary.doc2bow(text) for text in documents]
print("\ndoc2bow(document) - Convert document (a list of words) into the bag-of-words format = \nlist of (token_id, token_count) 2-tuples.")
print("here is the count for the first document(index [0]) for the first 3 words of the dictionary (index [3])",corpus[0][:3])


Description of 'dictionary':

type:		 <class 'gensim.corpora.dictionary.Dictionary'>
len:		 1153 this is the number of different words in the whole stemmed dataset
10 first keys:		 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
10 first values:		 ['a', 'against', 'also', 'am', 'an', 'and', 'anthoni', 'be', 'boxer', 'but']

The dictionary is just all the tokens. Careful, it is not in alphabetical order, 
despite what you might think from the first values!

doc2bow(document) - Convert document (a list of words) into the bag-of-words format = 
list of (token_id, token_count) 2-tuples.
here is the count for the first document(index [0]) for the first 3 words of the dictionary (index [3]) [(0, 2), (1, 1), (2, 1)]


In [26]:
#we apply the tfidf on each stemmer
df_tfidf_1 = tf_idf(df_stem, 'words_stem_1')
df_tfidf_2 = tf_idf(df_stem, 'words_stem_2')

The following lines is just to reformat the df_tfidf dataframes with column names

In [27]:
def get_headers(df, attr):
    documents = df[attr]
    dictionary = Dictionary(documents)
    return list(dictionary.values())

In [28]:
df_tfidf_headers_1 = get_headers(df_stem, 'words_stem_1')
df_tfidf_headers_2 = get_headers(df_stem, 'words_stem_2')

In [29]:
df_tfidf_1.columns = df_tfidf_headers_1
df_tfidf_2.columns = df_tfidf_headers_2

In [30]:
print(df_tfidf_1.shape)
df_tfidf_1.head()

(50, 1153)


Unnamed: 0,a,against,also,am,an,and,anthoni,be,boxer,but,...,knee,oper,optim,pleas,present,act,million,morocco,rain,slow
0,0.010795,0.077935,0.104181,0.15943,0.077935,0.01364,0.182116,0.016083,0.2981,0.035261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.009298,0.0,0.0,0.061035,0.0,0.027415,0.209159,0.055415,0.0,0.040497,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.004297,0.062051,0.0,0.0,0.0,0.01629,0.0,0.0,0.237344,0.028074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.005896,0.0,0.0,0.0,0.0,0.029801,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.005509,0.0,0.0,0.0,0.028481,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
print(df_tfidf_2.shape)
df_tfidf_2.head()

(50, 1068)


Unnamed: 0,a,against,also,am,an,and,anthony,be,box,but,...,psycholog,stronger,condit,kne,optim,pleas,pres,mil,morocco,rain
0,0.011233,0.081097,0.108408,0.165898,0.081097,0.014194,0.189505,0.016736,0.203976,0.036692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.009672,0.0,0.0,0.063485,0.0,0.028516,0.217558,0.05764,0.0,0.042123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.004369,0.063082,0.0,0.0,0.0,0.016561,0.0,0.0,0.158665,0.028541,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006091,0.0,0.0,0.0,0.0,0.030785,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.005688,0.0,0.0,0.0,0.029408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Save the result into csv files

In [34]:
!mkdir -p data/word2vec

In [35]:
file_name = 'data/word2vec/tfidf_stem_1.csv'
df_tfidf_1.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

In [37]:
file_name = 'data/word2vec/tfidf_stem_2.csv'
df_tfidf_2.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)