## Installation Of Gensim For Word2Vec

In [1]:
conda install -c gensim


Note: you may need to restart the kernel to use updated packages.


In [2]:
from gensim.models import Word2Vec,KeyedVectors

## Import Sklearn

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

## Importing Dataset

In [4]:
df= pd.read_csv('data_scraped6.csv')

## Trying For Combination

In [5]:
Combo = df["Body"]+df["Title"]+df["Comments"]
df = df.assign(Combo = Combo) 
df['Combo']=df['Combo'].fillna("")

In [6]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

## Taking the values from the dataset

In [7]:
Body=df['Combo'].values.tolist()

## Cleaning of the words by tokenising them and removing punctuations and stop words

In [8]:
newVec=list()
for line in Body:
    #create words token as well as remove punctuation in one go
    rem_tok_punc=RegexpTokenizer(r'\w+')
    tokens=rem_tok_punc.tokenize(line)
    
    #convert words to lower
    words=[w.lower() for w in tokens]
    #Invoke all english stopwords
    stop_word_list=list(stopwords.words('english'))
    stop_word_list.append('https')
    stop_word_list.append('www')
    stop_word_list.append('.com')
    #Remove Stop words
    words=[w for w in words if not w in stop_word_list]
    #Append words in the newVec
    newVec.append(words)
len(newVec)

4985

## Fitting Word2Vector Model 

In [9]:
model=Word2Vec(newVec,min_count=4,size=100,window=10,sg=1,iter=10)

## Number Of Words in Vocabulary

In [10]:
# This will give the total number of words in the vocabolary created from this dataset
model.wv.syn0.shape

  


(25212, 100)

## Finding Vector Of Each Word

In [11]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    ##print(index2word_set)
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [12]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [13]:
# Calculating average feature vector for training set
clean_train_reviews=df['Combo'].values
num_features=100
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 4985


  del sys.path[0]
  app.launch_new_instance()


Review 1000 of 4985
Review 2000 of 4985
Review 3000 of 4985
Review 4000 of 4985


In [14]:
trainDataVecs.shape

(4985, 100)

In [15]:
df.shape

(4985, 14)

In [16]:
data=trainDataVecs
traindata=pd.DataFrame(data=data[0:,0:])  # 1st row as the column names
traindata.shape

(4985, 100)

In [17]:
traindata=traindata.fillna(0)
traindata.describe

<bound method NDFrame.describe of             0         1         2         3         4         5         6   \
0     0.061371 -0.120607  0.040474 -0.097457  0.405365  0.384897 -0.113782   
1     0.053625 -0.018606  0.064007 -0.122914  0.405916  0.427729 -0.136103   
2     0.092077 -0.095275  0.069938 -0.124289  0.414719  0.444703 -0.125571   
3     0.122173 -0.080838  0.055664 -0.147828  0.411730  0.459858 -0.142195   
4     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
4980  0.088571 -0.111644  0.045078 -0.134630  0.419461  0.422749 -0.105639   
4981  0.104627 -0.076089  0.029455 -0.148830  0.415745  0.449461 -0.130220   
4982  0.083307 -0.051177 -0.015975 -0.149627  0.424199  0.505146 -0.120423   
4983  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4984  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

            7         8      

## Various Algorithms(LogisticRegression, KNN, CART,RandomForest,NaiveBayes) 

In [18]:
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR',LogisticRegression(max_iter=500,solver='lbfgs',multi_class='multinomial')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Rand_Forest', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append (('SDG', SGDClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed,shuffle=True)
	cv_results = model_selection.cross_val_score(model, traindata,df["Flair"], cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.219661 (0.018247)
KNN: 0.212242 (0.022807)
CART: 0.520777 (0.027339)




Rand_Forest: 0.520175 (0.026658)
NB: 0.157878 (0.016715)
SDG: 0.175719 (0.029710)
