## Installation Of Gensim For Word2Vec

In [1]:
conda install -c gensim


Note: you may need to restart the kernel to use updated packages.


In [2]:
from gensim.models import Word2Vec,KeyedVectors

## Import Sklearn

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

## Importing Dataset

In [4]:
df= pd.read_csv('data_scraped6.csv')

## Trying For Combination

In [5]:
Combo = df["Body"]+df["Comments"]+df["Url_address"]
df = df.assign(Combo = Combo) 
df['Combo']=df['Combo'].fillna("")

In [6]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

## Taking the values from the dataset

In [7]:
Body=df['Combo'].values.tolist()

## Cleaning of the words by tokenising them and removing punctuations and stop words

In [8]:
newVec=list()
for line in Body:
    #create words token as well as remove punctuation in one go
    rem_tok_punc=RegexpTokenizer(r'\w+')
    tokens=rem_tok_punc.tokenize(line)
    
    #convert words to lower
    words=[w.lower() for w in tokens]
    #Invoke all english stopwords
    stop_word_list=list(stopwords.words('english'))
    stop_word_list.append('https')
    stop_word_list.append('www')
    stop_word_list.append('.com')
    #Remove Stop words
    words=[w for w in words if not w in stop_word_list]
    #Append words in the newVec
    newVec.append(words)
len(newVec)

4985

## Fitting Word2Vector Model 

In [9]:
model=Word2Vec(newVec,min_count=4,size=100,window=10,sg=1,iter=10)

## Number Of Words in Vocabulary

In [10]:
# This will give the total number of words in the vocabolary created from this dataset
model.wv.syn0.shape

  


(25310, 100)

## Finding Vector Of Each Word

In [11]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    ##print(index2word_set)
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [12]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [13]:
# Calculating average feature vector for training set
clean_train_reviews=df['Combo'].values
num_features=100
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 4985


  del sys.path[0]
  app.launch_new_instance()


Review 1000 of 4985
Review 2000 of 4985
Review 3000 of 4985
Review 4000 of 4985


In [14]:
trainDataVecs.shape

(4985, 100)

In [15]:
df.shape

(4985, 14)

In [16]:
data=trainDataVecs
traindata=pd.DataFrame(data=data[0:,0:])  # 1st row as the column names
traindata.shape

(4985, 100)

In [17]:
traindata=traindata.fillna(0)
traindata.describe

<bound method NDFrame.describe of             0         1         2         3         4         5         6   \
0    -0.055012 -0.366370  0.102873  0.373568  0.242634 -0.527186  0.207517   
1    -0.003632 -0.374905  0.126889  0.313974  0.207356 -0.463061  0.230835   
2    -0.084856 -0.373741  0.143059  0.371829  0.253490 -0.517322  0.213618   
3    -0.061201 -0.396706  0.143908  0.308653  0.215951 -0.493807  0.230340   
4     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
4980 -0.096852 -0.382932  0.123683  0.371584  0.246138 -0.496684  0.247807   
4981 -0.112067 -0.376202  0.160470  0.326131  0.261739 -0.482876  0.249438   
4982 -0.023664 -0.397993  0.130027  0.314917  0.206930 -0.477457  0.256991   
4983  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4984  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

            7         8      

## Various Algorithms(LogisticRegression, KNN, CART,RandomForest,NaiveBayes) 

In [18]:
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR',LogisticRegression(max_iter=500,solver='lbfgs',multi_class='multinomial')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Rand_Forest', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append (('SDG', SGDClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed,shuffle=True)
	cv_results = model_selection.cross_val_score(model, traindata,df["Flair"], cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.214045 (0.020636)
KNN: 0.194993 (0.018713)
CART: 0.516766 (0.028161)




Rand_Forest: 0.521179 (0.026696)
NB: 0.157878 (0.016715)
SDG: 0.181940 (0.039887)
