In [1]:
import pandas as pd
import re

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer 

import warnings
warnings.filterwarnings('ignore')

# !pip install gensim

from gensim.test.utils import common_texts
from gensim.models import Word2Vec



col = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv(r'training.1600000.processed.noemoticon.csv', header = None, names = col,  encoding='latin-1')


df.dropna()

df = df.sample(n=50000)

# Handle Categories variable
df['target'] = df['target'].replace(4,1)


df['text'].head()

897764     has his facial mask on. Gotta be pretty!  What...
738779     when i heard Nick+Miley singing Before The Sto...
1226062    @easmart my name is rachel, i fail at maths &a...
655723                                    rinitis sucks!!!!!
35251      A bit frustrated - have used all day to find a...
Name: text, dtype: object

In [2]:

def get_part_of_day(h):
        if   5 <= h <= 11:
            return 'morning'
        elif 12 <= h <= 17:
            return 'afternoon'
        elif 18 <= h <= 22:
            return 'evening'
        else:
            return 'night'


day = []
month = []
dayInMonth = []
timeOfTweet = []
timePeriod = []
year = []

for d in df['date']:
    sub = d.split(' ')
    day.append(sub[0])
    dayInMonth.append(sub[2])
    timePeriod.append(get_part_of_day(int(sub[3][:2])))

    # time zones is only pdt
    # timeZone.append(sub[4])

    # year is only 2009
    # year.append(sub[5])



data = pd.DataFrame()

data['day'] = day
data['dayInMonth'] = dayInMonth
data['partOfDay'] = timePeriod

data = pd.get_dummies(data, columns=['day','partOfDay'])



In [3]:
data['target'] = df['target'].values
data['text'] = df['text'].values

df = data

data.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,partOfDay_morning,partOfDay_night,target,text
0,3,0,0,0,1,0,0,0,0,1,0,0,1,has his facial mask on. Gotta be pretty! What...
1,21,0,0,0,1,0,0,0,0,0,1,0,0,when i heard Nick+Miley singing Before The Sto...
2,1,0,1,0,0,0,0,0,0,0,1,0,1,"@easmart my name is rachel, i fail at maths &a..."
3,19,1,0,0,0,0,0,0,0,0,1,0,0,rinitis sucks!!!!!
4,20,0,1,0,0,0,0,0,0,0,1,0,0,A bit frustrated - have used all day to find a...


In [4]:
print("starting text clean")

# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()



ps = PorterStemmer()
sb = SnowballStemmer(language='english')


all_stopwords = stopwords.words('english')

corpus = []
corpus1 = []



for sen in df['text']:
    # remove hashtags
    sen = re.sub("(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", ' ', sen)
#     sen = re.sub('(@|#|&)+([a-zA-Z0-9_]+)', '', sen)
    # remove weird chars
    sen = re.sub('[^a-zA-z\'\"]+', ' ', sen)
    # remove urls
    sen = re.sub(r'\$\w*', '', sen)
    # remove old style retweet text "RT"
    sen = re.sub(r'^RT[\s]+', '', sen)
    # remove hyperlinks
    sen = re.sub(r'https?:\/\/.*[\r\n]*', '', sen)
    # only removing the hash # sign from the word
    sen = re.sub(r'#', '', sen)

    

    sen = sen.lower()
    sen = sen.split()

    


    
            # porter stemmer vs snowball stemmer
#     text = [ps.stem(word) for word in sen if not word in set(all_stopwords)]

        # lemmtization vs stemming    (word meaning vs stem)
#   text1 = [sb.stem(word) for word in sen if not word in set(all_stopwords)]
    text1 = [lemmatizer.lemmatize(word) for word in sen if not word in set(all_stopwords)]
    
    
#     text = ' '.join(text)
    text1 = ' '.join(text1)
    
    
#     corpus.append(text)
    corpus1.append(text1)



starting text clean


In [5]:
x = pd.DataFrame({'col':corpus1})
x.head()

Unnamed: 0,col
0,facial mask gotta pretty happening peep spell ...
1,heard nick miley singing storm felt cry
2,name rachel fail math amp amp day left
3,rinitis suck
4,bit frustrated used day find error server conf...


In [6]:
# !pip uninstall gensim --user

In [7]:
# !pip install gensim==3.8.3 --user

In [8]:
y = data['target'].values


from gensim.sklearn_api import W2VTransformer

x = x.to_numpy()

model = W2VTransformer(size=300,min_count=1)

# model = model.fit(x)


wordvecs = model.fit_transform(x)

In [9]:
x = wordvecs

In [10]:
x = pd.DataFrame(wordvecs)


df = pd.concat([df, x], axis=1, join='inner')
df = df.drop(['target','text'],axis=1)

df.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,...,290,291,292,293,294,295,296,297,298,299
0,3,0,0,0,1,0,0,0,0,1,...,-0.000462,-0.000787,-0.001238,-0.000109,0.000217,-0.001405,0.000747,0.000262,0.001075,0.000929
1,21,0,0,0,1,0,0,0,0,0,...,-0.001349,-0.00149,-0.00095,-0.000559,0.001228,-0.000647,-0.000773,-1.4e-05,0.001408,-0.000115
2,1,0,1,0,0,0,0,0,0,0,...,0.001417,0.000364,6e-05,-0.001293,0.001613,0.000218,0.000242,-0.001175,-0.001076,0.001147
3,19,1,0,0,0,0,0,0,0,0,...,0.000313,-0.000522,-0.001084,0.00151,0.000524,0.000767,-0.001117,0.000926,-0.000136,-0.000911
4,20,0,1,0,0,0,0,0,0,0,...,-6.2e-05,-0.000237,0.001641,0.00025,-0.000198,-0.001597,0.001364,0.000182,-0.001473,-0.000202


In [11]:
X_train, X_test,y_train, y_test = train_test_split(df, y, test_size=0.2, random_state = 42)
X_train1, X_test1,y_train1, y_test1 = train_test_split(x, y, test_size=0.2, random_state = 42)

In [12]:
# models

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier


DecTr = DecisionTreeClassifier()
AdaBoost = AdaBoostClassifier()
Knn = KNeighborsClassifier()

DecTr1 = DecisionTreeClassifier()
AdaBoost1 = AdaBoostClassifier()
Knn1 = KNeighborsClassifier()


In [13]:
print("fitting extended data")
DecTr.fit(X_train,y_train)
AdaBoost.fit(X_train,y_train)
Knn.fit(X_train,y_train)

fitting extended data


KNeighborsClassifier()

In [14]:
print("fitting only text")
DecTr1.fit(X_train1,y_train)
AdaBoost1.fit(X_train1,y_train)
Knn1.fit(X_train1,y_train)

fitting only text


KNeighborsClassifier()

In [15]:
print("predicting extended")
predictionsa = DecTr.predict(X_test)
predictionsb = AdaBoost.predict(X_test)
predictionsc = Knn.predict(X_test)

predicting extended


In [16]:
print("predicting original")
predictions1a = DecTr1.predict(X_test1)
predictions1b = AdaBoost1.predict(X_test1)
predictions1c = Knn1.predict(X_test1)

predicting original


In [17]:
print("Decision tree\n")

print("extended:")
print(confusion_matrix(y_test,predictionsa))
print(classification_report(y_test,predictionsa))
print(accuracy_score(y_test,predictionsa))

print("original:")
print(confusion_matrix(y_test,predictions1a))
print(classification_report(y_test,predictions1a))
print(accuracy_score(y_test,predictions1a))

Decision tree

extended:
[[3053 1998]
 [1932 3017]]
              precision    recall  f1-score   support

           0       0.61      0.60      0.61      5051
           1       0.60      0.61      0.61      4949

    accuracy                           0.61     10000
   macro avg       0.61      0.61      0.61     10000
weighted avg       0.61      0.61      0.61     10000

0.607
original:
[[2625 2426]
 [2346 2603]]
              precision    recall  f1-score   support

           0       0.53      0.52      0.52      5051
           1       0.52      0.53      0.52      4949

    accuracy                           0.52     10000
   macro avg       0.52      0.52      0.52     10000
weighted avg       0.52      0.52      0.52     10000

0.5228


In [18]:
print("Adaboost\n")

print("extended:")
print(confusion_matrix(y_test,predictionsb))
print(classification_report(y_test,predictionsb))
print(accuracy_score(y_test,predictionsb))

print("original:")
print(confusion_matrix(y_test,predictions1b))
print(classification_report(y_test,predictions1b))
print(accuracy_score(y_test,predictions1b))

Adaboost

extended:
[[2557 2494]
 [1284 3665]]
              precision    recall  f1-score   support

           0       0.67      0.51      0.58      5051
           1       0.60      0.74      0.66      4949

    accuracy                           0.62     10000
   macro avg       0.63      0.62      0.62     10000
weighted avg       0.63      0.62      0.62     10000

0.6222
original:
[[2387 2664]
 [2291 2658]]
              precision    recall  f1-score   support

           0       0.51      0.47      0.49      5051
           1       0.50      0.54      0.52      4949

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000

0.5045


In [19]:
print("KNN\n")

print("extended:")
print(confusion_matrix(y_test,predictionsc))
print(classification_report(y_test,predictionsc))
print(accuracy_score(y_test,predictionsc))

print("original:")
print(confusion_matrix(y_test,predictions1c))
print(classification_report(y_test,predictions1c))
print(accuracy_score(y_test,predictions1c))

KNN

extended:
[[2803 2248]
 [1539 3410]]
              precision    recall  f1-score   support

           0       0.65      0.55      0.60      5051
           1       0.60      0.69      0.64      4949

    accuracy                           0.62     10000
   macro avg       0.62      0.62      0.62     10000
weighted avg       0.62      0.62      0.62     10000

0.6213
original:
[[2582 2469]
 [2356 2593]]
              precision    recall  f1-score   support

           0       0.52      0.51      0.52      5051
           1       0.51      0.52      0.52      4949

    accuracy                           0.52     10000
   macro avg       0.52      0.52      0.52     10000
weighted avg       0.52      0.52      0.52     10000

0.5175
