In [1]:
import pandas as pd
import numpy as np 
import itertools as it
import matplotlib.pyplot as plt

In [2]:
import pickle
import time

In [3]:
from gensim.models import Word2Vec



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [5]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Location of the Google News Word2Vec binary (machine-specific absolute path).
# NOTE(review): no nltk corpora path is configured in this cell.
google_vec_file = r"C:\Users\moham\Metis Bootcamp\GoogleNewsVectors\GoogleNews-vectors-negative300.bin"

# Read the embeddings from the binary word2vec file into `model`.
model = KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

# Word -> vector lookup table built by pairing the vocabulary with its vectors.
# NOTE(review): `.wv` / `.syn0` look like deprecated gensim accessors - confirm
# against the installed gensim version before upgrading.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [6]:
# Load the pickled, pre-processed news-article corpus into `news_df`.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('./data/news_data_frame_reduced_preprocessed.pickle', 'rb') as file:
    news_df = pickle.load(file)

In [7]:
# Load the pickled unigram token lists. The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open("./data/grams/token_unigram_text", 'rb') as fileObject:
    uni_lem_comb2 = pickle.load(fileObject)

In [8]:
# Load the pickled bigram texts. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open("./data/grams/bigram_text", 'rb') as fileObject:
    bi_lem_comb2 = pickle.load(fileObject)

In [9]:
# Load the pickled trigram texts. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open("./data/grams/trigram_text", 'rb') as fileObject:
    tri_lem_comb2 = pickle.load(fileObject)

In [10]:
# One Series entry per document, each holding that document's unigram tokens.
uni_lem = pd.Series(list(uni_lem_comb2))
# Preview the first ten documents.
uni_lem.iloc[:10]

0    [ban, united, states, companies, selling, part...
1    [washington, hatice, cengiz, fiancee, murdered...
2    [least, six, civilians, including, women, chil...
3    [monday, may, photo, juliet, fine, principal, ...
4    [file, feb, file, photo, sen, doug, jones, ala...
5    [new, york, mayor, bill, blasio, arrives, offi...
6    [file, feb, file, photo, shows, oxycontin, pil...
7    [booking, photo, provided, chicago, police, de...
8    [file, friday, april, file, photo, far, right,...
9    [venezuela, opposition, leader, self, proclaim...
dtype: object

In [11]:
# One Series entry per document, each holding that document's bigram text.
bi_lem = pd.Series(list(bi_lem_comb2))
# Preview the first ten documents.
bi_lem.iloc[:10]

0    ban united_states companies selling parts huaw...
1    washington hatice cengiz fiancee murdered saud...
2    least_six civilians including women_children k...
3    monday may photo juliet fine principal beverly...
4    file_feb file_photo sen doug jones ala questio...
5    new_york mayor_bill blasio arrives official de...
6    file_feb file_photo shows oxycontin pills arra...
7    booking photo provided chicago_police departme...
8    file friday april file_photo far_right vox par...
9    venezuela opposition_leader self_proclaimed in...
dtype: object

In [12]:
# One Series entry per document, each holding that document's trigram text.
tri_lem = pd.Series(list(tri_lem_comb2))
# Preview the first ten documents.
tri_lem.iloc[:10]

0    ban united_states companies selling parts huaw...
1    washington hatice cengiz fiancee murdered saud...
2    least_six civilians including women_children k...
3    monday may_photo juliet fine principal beverly...
4    file_feb_file_photo sen doug jones ala questio...
5    new_york mayor_bill blasio arrives official de...
6    file_feb_file_photo shows oxycontin pills arra...
7    booking photo provided chicago_police departme...
8    file friday april file_photo far_right vox par...
9    venezuela opposition_leader self_proclaimed in...
dtype: object

#### Converting the Text into Trigram Vectors

In [13]:
def infer_vector(text, keyed_vectors=None):
    """Look up the embedding of every in-vocabulary token of ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens (e.g. one pre-processed document).
    keyed_vectors : object, optional
        Any object exposing ``word_vec(token)`` that raises ``KeyError`` for
        unknown tokens. Defaults to the notebook-level ``model``.

    Returns
    -------
    list
        One vector per token found in the vocabulary, in token order. Tokens
        missing from the vocabulary are skipped, so the list is empty when
        no token is known or ``text`` is blank.
    """
    if keyed_vectors is None:
        keyed_vectors = model

    vectors = []
    for token in text.split():
        try:
            vectors.append(keyed_vectors.word_vec(token))
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other exception signals a
            # real problem and is left to propagate instead of being hidden.
            continue
    return vectors

In [14]:
# Per-document list of word vectors for the trigram texts.
vec_text_tri = tri_lem.apply(infer_vector)
# Positions of documents for which no token produced a vector.
values_of_errors = [
    position
    for position, word_vectors in enumerate(vec_text_tri)
    if not word_vectors
]
# Boolean mask flagging those documents, then keep only the usable ones.
bad_series = vec_text_tri.index.isin(values_of_errors)
vector = vec_text_tri.loc[~bad_series]

In [15]:
%%time
# Collapse each document to the element-wise mean of its word vectors,
# giving one fixed-length vector per document.
vec_text_tri = pd.Series([np.mean(word_vectors, axis=0) for word_vectors in vector])

Wall time: 1.08 s


In [16]:
# Preview the first two averaged document vectors.
vec_text_tri.iloc[:2]

0    [-0.0042810584, 0.039145432, 0.03362472, 0.065...
1    [0.010554764, 0.029965691, 0.04417438, 0.03513...
dtype: object

In [17]:
# Wrap the Series in a single-column DataFrame (the column is labelled 0).
vec_text_tri = pd.DataFrame(data=vec_text_tri)

In [18]:
%%time
# Expand column 0 (one vector per row) into one numeric column per dimension.
vec_text_tri = pd.DataFrame(list(vec_text_tri[0].values))

Wall time: 272 ms


In [19]:
# Preview the first five rows of the trigram feature matrix.
vec_text_tri.iloc[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.004281,0.039145,0.033625,0.065445,-0.082601,-0.020334,0.024655,-0.027403,0.085103,0.043258,...,-0.058678,0.027957,-0.056587,0.032648,0.010886,0.003896,0.010895,-0.035646,0.039916,-0.020221
1,0.010555,0.029966,0.044174,0.035132,-0.061126,-0.015194,0.055819,-0.077717,0.104377,0.079149,...,-0.052899,-0.01986,-0.042446,0.034073,-0.031552,0.005538,-0.005109,-0.050903,0.037147,0.021626
2,0.027087,0.051287,0.044152,0.043642,-0.024894,-0.021078,-0.016652,-0.107409,0.101749,0.078397,...,-0.047129,-0.00474,-0.067274,0.012861,-0.05264,0.006913,-0.011003,-0.004494,0.03406,0.02155
3,0.027874,0.032129,0.046228,0.071021,-0.049118,-0.00941,0.038559,-0.073924,0.061498,0.028121,...,-0.088085,0.001812,-0.075012,0.036264,0.000982,-0.016414,0.001484,-0.056618,0.046487,-0.002936
4,0.016836,0.02173,0.039434,0.083412,-0.06415,-0.040388,0.047837,-0.032623,0.076677,0.04459,...,-0.049103,-0.006976,-0.034393,-0.010264,-0.040613,-0.00499,0.00394,-0.048302,0.036808,0.033324


In [20]:
# Persist the trigram document vectors. The context manager guarantees the
# file is flushed and closed once the dump completes.
filename = './data/grams/vec_text_trigram.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(vec_text_tri, out_file)

In [21]:
# Select the response column from the news DataFrame (still a Series here).
y_response_tri = news_df["Not_Real_or_Real"]

In [23]:
# Drop the labels of the documents that produced no word vectors so the
# response stays row-aligned with the trigram features.
# `values_of_errors` holds *positions* within `tri_lem`, so the mask is built
# positionally; matching those positions against the index labels of
# `news_df` would remove the wrong rows whenever that index is not 0..n-1.
# NOTE(review): `values_of_errors` is reassigned by the bigram section, so
# this cell must run before that section.
assert len(y_response_tri) == len(tri_lem), \
    "labels and trigram documents must have the same number of rows"
bad_series_news_df = np.isin(np.arange(len(y_response_tri)), values_of_errors)
y_response_tri = y_response_tri[~bad_series_news_df].values

In [38]:
# Persist the trigram response array. The context manager guarantees the
# file is flushed and closed once the dump completes.
filename = './data/grams/y_response_tri.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(y_response_tri, out_file)

#### Converting the Text into Bigram Vectors

In [25]:
# Per-document list of word vectors for the bigram texts.
vec_text_bi = bi_lem.apply(infer_vector)
# Positions of documents for which no token produced a vector.
values_of_errors = [
    position
    for position, word_vectors in enumerate(vec_text_bi)
    if not word_vectors
]
# Boolean mask flagging those documents, then keep only the usable ones.
bad_series = vec_text_bi.index.isin(values_of_errors)
vector = vec_text_bi.loc[~bad_series]

In [26]:
%%time
# Collapse each document to the element-wise mean of its word vectors,
# giving one fixed-length vector per document.
vec_text_bi = pd.Series([np.mean(word_vectors, axis=0) for word_vectors in vector])

Wall time: 1.11 s


In [27]:
# Preview the first two averaged document vectors.
vec_text_bi.iloc[:2]

0    [-0.004424307, 0.039920203, 0.03196768, 0.0680...
1    [0.010511921, 0.030049566, 0.04401093, 0.03551...
dtype: object

In [28]:
# Wrap the Series in a single-column DataFrame (the column is labelled 0).
vec_text_bi = pd.DataFrame(data=vec_text_bi)

In [29]:
%%time
# Expand column 0 (one vector per row) into one numeric column per dimension.
vec_text_bi = pd.DataFrame(list(vec_text_bi[0].values))

Wall time: 280 ms


In [30]:
# Preview the first five rows of the bigram feature matrix.
vec_text_bi.iloc[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.004424,0.03992,0.031968,0.068061,-0.080996,-0.024467,0.022741,-0.03074,0.084796,0.042647,...,-0.058602,0.02556,-0.059898,0.032545,0.013791,0.002109,0.007907,-0.033907,0.037345,-0.019121
1,0.010512,0.03005,0.044011,0.035513,-0.062077,-0.015621,0.057381,-0.077247,0.105612,0.081105,...,-0.053423,-0.019029,-0.044617,0.034856,-0.032338,0.003783,-0.003092,-0.052623,0.038148,0.021211
2,0.024445,0.049508,0.04212,0.041442,-0.028839,-0.022661,-0.016139,-0.106955,0.10106,0.076055,...,-0.050624,-0.003738,-0.064846,0.011221,-0.051285,0.007612,-0.012321,-0.007127,0.033073,0.027607
3,0.027942,0.033355,0.044223,0.070619,-0.048846,-0.008677,0.039634,-0.073948,0.062209,0.026507,...,-0.089344,0.001562,-0.075675,0.033858,-6.2e-05,-0.01496,0.002777,-0.05762,0.044924,-0.002511
4,0.016295,0.022985,0.037451,0.082624,-0.068564,-0.043832,0.04392,-0.029424,0.080041,0.042513,...,-0.052999,-0.012819,-0.035802,-0.015378,-0.039352,-0.005778,0.002291,-0.044876,0.036111,0.036912


In [31]:
# Persist the bigram document vectors. The context manager guarantees the
# file is flushed and closed once the dump completes.
filename = './data/grams/vec_text_bigram.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(vec_text_bi, out_file)

In [32]:
# Select the response column from the news DataFrame (still a Series here).
y_response_bi = news_df["Not_Real_or_Real"]

In [33]:
# Drop the labels of the documents that produced no word vectors so the
# response stays row-aligned with the bigram features.
# `values_of_errors` holds *positions* within `bi_lem`, so the mask is built
# positionally; matching those positions against the index labels of
# `news_df` would remove the wrong rows whenever that index is not 0..n-1.
# NOTE(review): `values_of_errors` is also assigned by the trigram section;
# make sure the bigram vectorisation cell ran most recently.
assert len(y_response_bi) == len(bi_lem), \
    "labels and bigram documents must have the same number of rows"
bad_series_news_df = np.isin(np.arange(len(y_response_bi)), values_of_errors)
y_response_bi = y_response_bi[~bad_series_news_df].values

In [37]:
# Persist the bigram response array. The context manager guarantees the
# file is flushed and closed once the dump completes.
filename = './data/grams/y_response_bi.sav'
with open(filename, 'wb') as out_file:
    pickle.dump(y_response_bi, out_file)

## Reload Data

In [None]:
# Reload the pickled, pre-processed news-article corpus into `news_df`.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('./data/news_data_frame_reduced_preprocessed.pickle', 'rb') as file:
    news_df = pickle.load(file)

In [12]:
# Reload the saved trigram document vectors. The context manager closes the
# file handle as soon as unpickling finishes instead of leaving it open.
with open("./data/grams/vec_text_trigram.sav", 'rb') as fileObject:
    vec_text_trigram = pickle.load(fileObject)

In [15]:
# Reload the saved trigram response array. The context manager closes the
# file handle as soon as unpickling finishes instead of leaving it open.
with open("./data/grams/y_response_tri.sav", 'rb') as fileObject:
    y_response_tri = pickle.load(fileObject)

In [16]:
# Reload the saved bigram document vectors. The context manager closes the
# file handle as soon as unpickling finishes instead of leaving it open.
with open("./data/grams/vec_text_bigram.sav", 'rb') as fileObject:
    vec_text_bigram = pickle.load(fileObject)

In [17]:
# Reload the saved bigram response array. The context manager closes the
# file handle as soon as unpickling finishes instead of leaving it open.
with open("./data/grams/y_response_bi.sav", 'rb') as fileObject:
    y_response_bi = pickle.load(fileObject)