In [1]:
import pandas as pd
import re

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

import warnings
warnings.filterwarnings('ignore')


col = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv('training.1600000.processed.noemoticon.csv', header = None, names = col,  encoding='latin-1')

# remove empty
df.dropna()
df.fillna(value='', inplace=True)


# df = df[['text','target']]
df = df.sample(n=50000)

# Handle Categories variable
df['target'] = df['target'].replace(4,1)


df['text'].head()

866949     @jose_castro not bad at all, it has some very ...
253257     Went to Carson City to find shoes fir spinning...
438837     So last night started with Saki Bombs and Sush...
1010479    Traded with @zazabronkhorst this morning  3 of...
48098      awww, boo, missed Young Dracula cos I was slee...
Name: text, dtype: object

In [2]:

def get_part_of_day(h):
        if   5 <= h <= 11:
            return 'morning'
            # return '0'
        elif 12 <= h <= 17:
            return "afternoon"
            # return "1"
        elif 18 <= h <= 22:
            return 'evening'
            # return '2'
        else:
            return "night"
            # return "3"


day = []
month = []
dayInMonth = []
timeOfTweet = []
timePeriod = []
year = []

for d in df['date']:
    sub = d.split(' ')
    # day.append(dayInMonth(sub[0]))
    day.append(sub[0])
    # month.append(sub[1])
    dayInMonth.append(sub[2])
    # timeOfTweet.append(sub[3])
    timePeriod.append(get_part_of_day(int(sub[3][:2])))

    # time zones is only pdt
    # timeZone.append(sub[4])

    # year is only 2009
    # year.append(sub[5])



data = pd.DataFrame()

data['day'] = day
# data['month'] = month
data['dayInMonth'] = dayInMonth
# data['time'] = time
data['partOfDay'] = timePeriod

data = pd.get_dummies(data, columns=['day','partOfDay'])



In [3]:
data['target'] = df['target'].values
data['text'] = df['text'].values

df = data

data.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,partOfDay_morning,partOfDay_night,target,text
0,2,0,0,1,0,0,0,0,0,0,0,1,1,"@jose_castro not bad at all, it has some very ..."
1,31,0,0,0,1,0,0,0,1,0,0,0,0,Went to Carson City to find shoes fir spinning...
2,7,0,0,0,1,0,0,0,0,0,1,0,0,So last night started with Saki Bombs and Sush...
3,22,1,0,0,0,0,0,0,0,0,0,1,1,Traded with @zazabronkhorst this morning 3 of...
4,2,0,0,1,0,0,0,0,0,0,0,1,0,"awww, boo, missed Young Dracula cos I was slee..."


In [4]:
print("starting text clean")


!pip install langdetect
from langdetect import detect

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

ps = PorterStemmer()
sb = SnowballStemmer(language='english')


import nltk
nltk.download('stopwords')


all_stopwords = stopwords.words('english')

corpus = []
corpus1 = []



for sen in df['text']:
    # remove hashtags
    sen = re.sub("(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", ' ', sen)
#     sen = re.sub('(@|#|&)+([a-zA-Z0-9_]+)', '', sen)
    # remove weird chars
    sen = re.sub('[^a-zA-z\'\"]+', ' ', sen)
    # remove urls
    sen = re.sub(r'\$\w*', '', sen)
    # remove old style retweet text "RT"
    sen = re.sub(r'^RT[\s]+', '', sen)
    # remove hyperlinks
    sen = re.sub(r'https?:\/\/.*[\r\n]*', '', sen)
    # only removing the hash # sign from the word
    sen = re.sub(r'#', '', sen)

    if sen == "" or sen.isspace():
            continue


    if detect(sen) != 'en':
            continue

    sen = sen.lower()
    sen = sen.split()
    
    
    text1 = [lemmatizer.lemmatize(word) for word in sen if not word in set(all_stopwords)]
#     text1 = [sb.stem(word) for word in sen if not word in set(all_stopwords)]
    
#     text = ' '.join(sen)
    text1 = ' '.join(text1)
    
    
#     corpus.append(text)
    corpus1.append(text1)


starting text clean


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Naor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(corpus1)

In [6]:
print(type(vector))
words = pd.DataFrame.sparse.from_spmatrix(vector)
print(type(words))
df = pd.concat([df, words], axis=1, join='inner')


df = df.drop('text',axis=1)

df[df.columns[15:]] = df.iloc[:, 15:].astype('float16')


<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.frame.DataFrame'>


In [7]:
y = df.target
X = df.drop('target', axis=1)
X1 = pd.DataFrame.sparse.from_spmatrix(vector)

X.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,...,29633,29634,29635,29636,29637,29638,29639,29640,29641,29642
0,2,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,31,0,0,0,1,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29633,29634,29635,29636,29637,29638,29639,29640,29641,29642
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##  Only text vectorization

In [9]:
print("starting splitting data")

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state = 42)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y,test_size = 0.3, random_state = 42)


starting splitting data


In [10]:
# models
# !pip install sklearn.naive_baye
# !pip install sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

DecTr = DecisionTreeClassifier()
AdaBoost = AdaBoostClassifier()
Knn = KNeighborsClassifier()
# svm = SVC(gamma='auto')
mlp = MLPClassifier()
# gpc = GaussianProcessClassifier()
# gnb = GaussianNB()
rfc = RandomForestClassifier()
bc = BaggingClassifier()
lr = LogisticRegression()

DecTr1 = DecisionTreeClassifier()
AdaBoost1 = AdaBoostClassifier()
Knn1 = KNeighborsClassifier()
svm1 = SVC(gamma='auto')
mlp1 = MLPClassifier()
gpc1 = GaussianProcessClassifier()
gnb1 = GaussianNB()
rfc1 = RandomForestClassifier()
bc1 = BaggingClassifier()
lr1 = LogisticRegression()

In [11]:
print("fitting extended data")
DecTr.fit(X_train,y_train)
AdaBoost.fit(X_train,y_train)
Knn.fit(X_train,y_train)
# svm.fit(X_train,y_train)
mlp.fit(X_train,y_train)
# gpc.fit(X_train,y_train)
# gnb.fit(X_train,y_train)
rfc.fit(X_train,y_train)
bc.fit(X_train,y_train)
lr.fit(X_train,y_train)

fitting extended data


MemoryError: Unable to allocate 3.55 GiB for an array with shape (32139, 29655) and data type float32

In [None]:

# svm1 = SVC(probability=True)
print("fitting only text")
DecTr1.fit(X_train1,y_train1)
AdaBoost1.fit(X_train1,y_train1)
Knn1.fit(X_train1,y_train1)
X_train1Numpy = vector.toarray()
# svm1.fit(X_train1Numpy,y_train1)
mlp1.fit(X_train1,y_train1)
# gpc1.fit(X_train1Numpy,y_train1)
# gnb1.fit(X_train1Numpy,y_train1)
rfc1.fit(X_train1,y_train1)
bc1.fit(X_train1,y_train1)
lr1.fit(X_train1,y_train1)

In [None]:
print("predicting extended")
predictionsa = DecTr.predict(X_test)
predictionsb = AdaBoost.predict(X_test)
predictionsc = Knn.predict(X_test)
# predictionsd = svm.predict(X_test)
predictionse = mlp.predict(X_test)
# predictionsf = gpc.predict(X_test)
# predictionsg = gnb.predict(X_test)
predictionsh = rfc.predict(X_test)
predictionsi = bc.predict(X_test)
predictionsj = lr.predict(X_test)

In [None]:
print("predicting original")
predictions1a = DecTr1.predict(X_test1)
predictions1b = AdaBoost1.predict(X_test1)
predictions1c = Knn1.predict(X_test1)
# predictions1d = svm1.predict(X_test1)
predictions1e = mlp1.predict(X_test1)
# predictions1f = gpc1.predict(X_test1)
# predictions1g = gnb1.predict(X_test1)
predictions1h = rfc1.predict(X_test1)
predictions1i = bc1.predict(X_test1)
predictions1j = lr1.predict(X_test1)

In [None]:
print("Decision tree\n")

print("extended:")
print(confusion_matrix(y_test,predictionsa))
print(classification_report(y_test,predictionsa))
print(accuracy_score(y_test,predictionsa))

print("original:")
print(confusion_matrix(y_test1,predictions1a))
print(classification_report(y_test1,predictions1a))
print(accuracy_score(y_test1,predictions1a))

In [None]:
print("Adaboost\n")

print("extended:")
print(confusion_matrix(y_test,predictionsb))
print(classification_report(y_test,predictionsb))
print(accuracy_score(y_test,predictionsb))

print("original:")
print(confusion_matrix(y_test1,predictions1b))
print(classification_report(y_test1,predictions1b))
print(accuracy_score(y_test1,predictions1b))

In [None]:
print("KNN\n")

print("extended:")
print(confusion_matrix(y_test,predictionsc))
print(classification_report(y_test,predictionsc))
print(accuracy_score(y_test,predictionsc))

print("original:")
print(confusion_matrix(y_test1,predictions1c))
print(classification_report(y_test1,predictions1c))
print(accuracy_score(y_test1,predictions1c))

In [None]:
# print("SVM\n")

# print("extended:")
# print(confusion_matrix(y_test,predictionsd))
# print(classification_report(y_test,predictionsd))
# print(accuracy_score(y_test,predictionsd))

# print("original:")
# print(confusion_matrix(y_test1,predictions1d))
# print(classification_report(y_test1,predictions1d))
# print(accuracy_score(y_test1,predictions1d))

In [None]:
print("MLP\n")

print("extended:")
print(confusion_matrix(y_test,predictionse))
print(classification_report(y_test,predictionse))
print(accuracy_score(y_test,predictionse))

print("original:")
print(confusion_matrix(y_test1,predictions1e))
print(classification_report(y_test1,predictions1e))
print(accuracy_score(y_test1,predictions1e))

In [None]:
# print("Gaussian Process\n")

# print("extended:")
# print(confusion_matrix(y_test,predictionsf))
# print(classification_report(y_test,predictionsf))
# print(accuracy_score(y_test,predictionsf))

# print("original:")
# print(confusion_matrix(y_test1,predictions1f))
# print(classification_report(y_test1,predictions1f))
# print(accuracy_score(y_test1,predictions1f))

In [None]:
# print("Gaussian NB\n")

# print("extended:")
# print(confusion_matrix(y_test,predictionsg))
# print(classification_report(y_test,predictionsg))
# print(accuracy_score(y_test,predictionsg))

# print("original:")
# print(confusion_matrix(y_test1,predictions1g))
# print(classification_report(y_test1,predictions1g))
# print(accuracy_score(y_test1,predictions1g))


In [None]:
print("Random Forest\n")

print("extended:")
print(confusion_matrix(y_test,predictionsh))
print(classification_report(y_test,predictionsh))
print(accuracy_score(y_test,predictionsh))

print("original:")
print(confusion_matrix(y_test1,predictions1h))
print(classification_report(y_test1,predictions1h))
print(accuracy_score(y_test1,predictions1h))


In [None]:
print("Bagging\n")

print("extended:")
print(confusion_matrix(y_test,predictionsi))
print(classification_report(y_test,predictionsi))
print(accuracy_score(y_test,predictionsi))

print("original:")
print(confusion_matrix(y_test1,predictions1i))
print(classification_report(y_test1,predictions1i))
print(accuracy_score(y_test1,predictions1i))

In [None]:
print("Logistic Regression\n")

print("extended:")
print(confusion_matrix(y_test,predictionsj))
print(classification_report(y_test,predictionsj))
print(accuracy_score(y_test,predictionsj))

print("original:")
print(confusion_matrix(y_test1,predictions1j))
print(classification_report(y_test1,predictions1j))
print(accuracy_score(y_test1,predictions1j))
