In [1]:
import pandas as pd
import numpy as np 
import itertools as it
import matplotlib.pyplot as plt

In [2]:
import pickle
import time

In [3]:
from gensim.models import Word2Vec



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [5]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Location of the pretrained Google News Word2Vec binary.
# NOTE(review): this is a machine-specific absolute path; move it to a config
# cell / environment variable before sharing the notebook.
google_vec_file = r"C:\Users\moham\Metis Bootcamp\GoogleNewsVectors\GoogleNews-vectors-negative300.bin"

# load_word2vec_format already returns a KeyedVectors instance, so the
# deprecated `.wv` self-alias is not needed.
model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

# gensim 4 renamed `index2word` to `index_to_key` (and `syn0` to `vectors`);
# pick whichever vocabulary attribute this gensim version provides.
_vocab_words = model.index_to_key if hasattr(model, "index_to_key") else model.index2word

# Plain dict mapping each vocabulary word to its embedding row.
w2v = dict(zip(_vocab_words, model.vectors))

In [6]:
# Load the pickled news-article corpus into `news_df`.
with open('./data/news_df.pickle', 'rb') as corpus_handle:
    news_df = pickle.load(corpus_handle)

In [7]:
# Load the pickled unigram token text; the context manager closes the file
# handle, which the bare open() call previously leaked.
with open("./data/grams/token_unigram_text", 'rb') as fileObject:
    uni_lem_comb2 = pickle.load(fileObject)

In [8]:
# Load the pickled bigram text; the context manager closes the file handle,
# which the bare open() call previously leaked.
with open("./data/grams/bigram_text", 'rb') as fileObject:
    bi_lem_comb2 = pickle.load(fileObject)

In [9]:
# Load the pickled trigram text; the context manager closes the file handle,
# which the bare open() call previously leaked.
with open("./data/grams/trigram_text", 'rb') as fileObject:
    tri_lem_comb2 = pickle.load(fileObject)

In [10]:
# Put the unigram entries into a Series and preview the first ten.
uni_lem = pd.Series(list(uni_lem_comb2))
uni_lem.head(10)

0    [democrats, united, states, house, representat...
1    [british, steel, ordered, compulsory, liquidat...
2    [iranian, foreign, minister, javad, zarif, war...
3    [muslim, women, walk, near, burnt, car, jakart...
4    [big, question, eu, vote, well, far, right, fi...
5    [file, feb, file, photo, virginia, gov, ralph,...
6    [file, april, file, photo, conductor, zubin, m...
7    [file, feb, file, photo, boston, bruins, goali...
8    [speaker, house, nancy, pelosi, calif, speaks,...
9    [file, wednesday, jan, file, photo, prize, win...
dtype: object

In [11]:
# Put the bigram entries into a Series and preview the first ten.
bi_lem = pd.Series(list(bi_lem_comb2))
bi_lem.head(10)

0    democrats united_states house_representatives ...
1    british_steel ordered compulsory liquidation t...
2    iranian_foreign minister_javad zarif warned un...
3    muslim women walk near burnt car jakarta indon...
4    big question eu vote well far_right file satur...
5    file_feb file_photo virginia_gov ralph_northam...
6    file april file_photo conductor zubin mehta is...
7    file_feb file_photo boston bruins goalie tuukk...
8    speaker_house nancy_pelosi calif speaks report...
9    file wednesday jan file_photo prize_winning ke...
dtype: object

In [12]:
# Put the trigram entries into a Series and preview the first ten.
tri_lem = pd.Series(list(tri_lem_comb2))
tri_lem.head(10)

0    democrats united_states house_representatives ...
1    british_steel ordered compulsory liquidation t...
2    iranian_foreign minister_javad_zarif warned un...
3    muslim_women walk near burnt car jakarta indon...
4    big_question eu vote well far_right file satur...
5    file_feb_file_photo virginia_gov_ralph_northam...
6    file april file_photo conductor zubin mehta is...
7    file_feb_file_photo boston bruins goalie tuukk...
8    speaker_house nancy_pelosi_calif speaks report...
9    file wednesday jan file_photo prize_winning ke...
dtype: object

#### Converting the Text into Vectors

In [13]:
def infer_vector(text):
    """Look up the embedding of every in-vocabulary token in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.

    Returns
    -------
    list
        One vector per token that the module-level ``model`` can resolve via
        ``word_vec``, in token order. Unknown tokens are skipped, so the list
        is empty when no token is in the vocabulary.
    """
    vectors = []
    for token in text.split():
        try:
            vectors.append(model.word_vec(token))
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so that
            # genuine failures (and KeyboardInterrupt) are no longer silenced.
            continue
    return vectors

In [14]:
# Embed every trigram document, then record the positions of documents for
# which no token was found in the embedding vocabulary.
vec_text = tri_lem.apply(infer_vector)
values_of_erros = [pos for pos, token_vecs in enumerate(vec_text) if len(token_vecs) == 0]

# Boolean mask of the empty documents; keep only the non-empty ones.
bad_series = vec_text.index.isin(values_of_erros)
vector = vec_text[~bad_series]

In [15]:
%%time
# Collapse each document's list of token vectors into one element-wise mean vector.
vec_text = pd.Series([np.mean(token_vecs, axis=0) for token_vecs in vector])

Wall time: 2.74 s


In [16]:
# Preview the first two document vectors.
vec_text.head(2)

0    [0.012582379, 0.04536337, 0.031114908, 0.07705...
1    [0.02133775, 0.026488008, 0.02957555, 0.040369...
dtype: object

In [17]:
# Wrap the Series of per-document mean vectors in a one-column DataFrame
# (column label 0); the following cell expands that column into one column
# per vector component.
vec_text = pd.DataFrame(vec_text)

In [18]:
%%time
# Expand column 0 (one array per document) into a wide frame with one column
# per vector component.
vec_text = pd.DataFrame(vec_text[0].tolist())

Wall time: 985 ms


In [19]:
# Show the first five rows (head() defaults to 5).
vec_text.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.012582,0.045363,0.031115,0.07706,-0.068374,-0.03383,0.030153,-0.039497,0.089389,0.031833,...,-0.049665,-0.042488,-0.035812,0.031589,-0.023756,-0.045757,0.021921,-0.053957,0.020599,0.049864
1,0.021338,0.026488,0.029576,0.04037,-0.034419,-0.037616,0.039927,-0.050052,0.116378,0.054397,...,-0.113467,0.041973,-0.088883,0.037254,0.009839,0.026207,-0.000665,-0.000213,0.052724,-0.062033
2,0.016563,0.059617,0.066829,0.081169,-0.068136,-0.002744,0.003846,-0.116771,0.074068,0.07807,...,-0.038157,0.020842,-0.050216,0.033304,-0.03191,-0.003213,-0.009037,-0.034112,0.037966,0.025232
3,0.057837,0.039482,0.055657,0.062033,-0.030582,-0.040403,-0.026342,-0.080705,0.07001,0.042793,...,-0.080221,-0.039261,-0.069008,-0.004207,-0.021162,0.000116,-0.021799,-0.055461,0.036999,0.04128
4,0.027191,0.049305,0.038407,0.109705,-0.074799,-0.051456,-0.017526,-0.099109,0.078649,0.0675,...,-0.059277,0.00843,-0.063211,0.023837,-0.032217,0.002827,-0.00215,-0.019368,0.041907,0.026303


In [20]:
# Persist the trigram document-vector DataFrame to disk. The context manager
# guarantees the file is flushed and closed (the bare open() handle was leaked).
filename = './data/grams/vec_text_trigram.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(vec_text, out_file)

In [21]:
# Pull the response column (still a pandas Series at this point) out of the corpus frame.
y_response = news_df["Not_Real_or_Real"]

In [22]:
# `values_of_erros` holds row *positions* (it was built with enumerate), so the
# matching labels are dropped by position rather than by index label. Matching
# on labels silently misaligns whenever news_df lacks a default 0..n-1 index.
# NOTE(review): assumes news_df rows are in the same order as tri_lem — confirm.
bad_series_news_df = np.isin(np.arange(len(news_df)), values_of_erros)

# Derive from news_df rather than from y_response itself so that re-running
# this cell does not fail on an already-converted ndarray.
y_response = news_df.Not_Real_or_Real[~bad_series_news_df].values

In [60]:
# Persist the filtered response array to disk. The context manager guarantees
# the file is flushed and closed (the bare open() handle was leaked).
filename = './data/grams/y_response.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(y_response, out_file)