In [None]:
import pandas as pd
import nltk
import string
from sklearn  import svm
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

In [None]:
# Fetch the NLTK "punkt" tokenizer package; nltk.word_tokenize is called by
# the tokenizer defined later in this notebook.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the labelled tweets into a DataFrame.
# NOTE(review): absolute path (looks like a Colab upload location) — adjust
# when running in another environment.
df = pd.read_csv("/content/tweets_suspect.csv")

In [None]:
# Show the frame as loaded: an "Unnamed: 0" column plus message and label.
df

Unnamed: 0,message,label
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",1
1,is upset that he can't update his Facebook by ...,1
2,@Kenichan I dived many times for the ball. Man...,1
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",1
...,...,...
59995,"Really wants to go and see 17 again, because Z...",1
59996,@krissa22 Thank you!,1
59997,dreaming of you,1
59998,@TheEllenShow I saw a clip online! good show!,1


In [None]:
# Sample tweet kept around for trying the tokenizer by hand.
text = "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"

def tokenizer(text: str):
  """Lower-case a message and split it into tokens, dropping punctuation.

  Args:
    text: Raw message text.

  Returns:
    list[str]: the tokens produced by ``nltk.word_tokenize`` on the
    lower-cased text, without the tokens that consist solely of
    ``string.punctuation`` characters.
  """
  punctuation = set(string.punctuation)
  words = nltk.word_tokenize(text.lower())
  # Check character by character. ``word not in string.punctuation`` is a
  # substring test against the punctuation string, so multi-character tokens
  # such as "..." or "!!" were not recognised as punctuation and slipped
  # through.
  return [word for word in words if not all(ch in punctuation for ch in word)]

In [None]:
# Tokenize every message once and store the token lists next to the raw text.
df['message_words'] = df['message'].apply(tokenizer)

# Token lists are the model input; the label column is the prediction target.
features, target = df['message_words'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the frame again, now including the message_words column added above.
df

Unnamed: 0,message,label,message_words
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",1,"[switchfoot, http, //twitpic.com/2y1zl, awww, ..."
1,is upset that he can't update his Facebook by ...,1,"[is, upset, that, he, ca, n't, update, his, fa..."
2,@Kenichan I dived many times for the ball. Man...,1,"[kenichan, i, dived, many, times, for, the, ba..."
3,my whole body feels itchy and like its on fire,0,"[my, whole, body, feels, itchy, and, like, its..."
4,"@nationwideclass no, it's not behaving at all....",1,"[nationwideclass, no, it, 's, not, behaving, a..."
...,...,...,...
59995,"Really wants to go and see 17 again, because Z...",1,"[really, wants, to, go, and, see, 17, again, b..."
59996,@krissa22 Thank you!,1,"[krissa22, thank, you]"
59997,dreaming of you,1,"[dreaming, of, you]"
59998,@TheEllenShow I saw a clip online! good show!,1,"[theellenshow, i, saw, a, clip, online, good, ..."


In [None]:
# Train 300-dimensional word embeddings on the tokenized messages.
# NOTE(review): the model is fitted on every row of df, i.e. also on the
# messages that train_test_split placed in X_test — confirm this is intended.
w2vec_model = Word2Vec(
    sentences=df['message_words'],
    size=300,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the trained model to the working directory.
w2vec_model.save("word2vec.model")

In [None]:
def featureVecMethod(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one message.

    Args:
        words: Iterable of token strings.
        model: Trained embedding model whose keyed vectors are reachable as
            ``model.wv`` (must support ``word in model.wv`` and
            ``model.wv[word]``).
        num_features: Length of each word vector.

    Returns:
        numpy.ndarray: float32 array of length ``num_features`` holding the
        mean of the vectors of all tokens found in the vocabulary. When no
        token is in the vocabulary (or ``words`` is empty) the zero vector is
        returned instead of NaNs.
    """
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    # Look words up through the keyed vectors: indexing the model object
    # directly (``model[word]``) is deprecated, and a membership test on
    # ``model.wv`` avoids rebuilding a set of the whole vocabulary per call.
    keyed_vectors = model.wv

    for word in words:
        if word in keyed_vectors:
            nwords += 1
            featureVec += keyed_vectors[word]

    # Nothing to average: return the zero vector rather than dividing by zero,
    # which would put NaNs into the feature matrix.
    if nwords == 0:
        return featureVec

    return featureVec / nwords

In [None]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    """Build a matrix with one averaged word vector per message.

    Args:
        reviews: Sized iterable of token lists, one per message.
        model: Embedding model forwarded to ``featureVecMethod``.
        num_features: Length of each word vector (number of matrix columns).

    Returns:
        numpy.ndarray: float32 array of shape ``(len(reviews), num_features)``;
        row ``i`` is the result of ``featureVecMethod`` for the i-th message.
    """
    averaged = np.zeros((len(reviews), num_features), dtype="float32")
    for row, tokens in enumerate(reviews):
        averaged[row] = featureVecMethod(tokens, model, num_features)
        processed = row + 1
        # Progress report every 10,000 messages.
        if processed % 10000 == 0:
            print (processed)
    return averaged

In [None]:
# Average the word vectors of every training message.
# The vector length is read from the trained model instead of repeating the
# literal 300, so it cannot drift from the size the embeddings were built with.
train_vectors = getAvgFeatureVecs(X_train, w2vec_model, w2vec_model.wv.vector_size)

  featureVec = np.add(featureVec,model[word])


10000
20000
30000
40000


In [None]:
# Fit a 100-tree random forest on the averaged message vectors.
# random_state is pinned (same seed as the train/test split) so that the
# forest construction itself no longer varies between runs.
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_vectors, y_train)

In [None]:
# Average the word vectors of every held-out message.
# The vector length is read from the trained model instead of repeating the
# literal 300, so it cannot drift from the size the embeddings were built with.
test_vectors = getAvgFeatureVecs(X_test, w2vec_model, w2vec_model.wv.vector_size)

  featureVec = np.add(featureVec,model[word])


10000


In [None]:
# Predict a label for every held-out message vector.
predictions = forest.predict(test_vectors)

In [None]:
# Precision, recall and F-score with label 1 treated as the positive class;
# the fourth returned value is discarded.
precision, recall, fscore, _ = score(y_test, predictions, pos_label= 1, average='binary')

In [None]:
# Report precision and recall together with accuracy, computed here as the
# share of predictions equal to y_test. fscore is computed above but not shown.
# NOTE(review): the recorded output (recall 1.0, precision ~= accuracy ~= 0.90)
# suggests the model predicts label 1 for almost every test message — confirm
# against the label distribution before reading 0.90 accuracy as a good result.
print(f'Precision: {precision} / Recall: {recall} / Accuracy: {(predictions==y_test).sum() / len(predictions)}')

Precision: 0.9000417536534446 / Recall: 1.0 / Accuracy: 0.90025
