In [1]:
!pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import time

In [9]:
# Load the tweet CSV into a DataFrame. The text is decoded as latin-1, and
# low_memory=False makes pandas parse the file in one pass instead of chunks.
df = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    low_memory=False,
)

In [11]:
# Show the first five rows to check which columns were parsed.
df.head(n=5)

Unnamed: 0,ids,date,user,text,target
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,0
1,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,0
2,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,0
3,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....",0
4,1467811372,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,@Kwesidei not the whole crew,0


In [12]:
# Draw a balanced subset: 25000 random rows labelled 0 (kept as neg_df) and
# 25000 random rows labelled 4 (kept as pos_df). The fixed seed makes the
# draw repeatable.
neg_df = df.loc[df["target"] == 0].sample(n=25000, random_state=5)
pos_df = df.loc[df["target"] == 4].sample(n=25000, random_state=5)

In [13]:
# Show the first five sampled rows with target == 0.
neg_df.head(n=5)

Unnamed: 0,ids,date,user,text,target
740007,2265878782,Sun Jun 21 07:29:15 PDT 2009,MeeJong,This morning my daughter asked me if I hate my...,0
562036,2205565064,Wed Jun 17 05:00:35 PDT 2009,kathysyahrizal,"I forgot to charge my bb, zzzzz only 35% left",0
318661,2002872654,Tue Jun 02 05:22:07 PDT 2009,kgautam,my MBP battery is fluctuating between dead to ...,0
131276,1835496893,Mon May 18 06:07:33 PDT 2009,thejanice,i really wish my landlord would call me back. ...,0
39699,1573596472,Tue Apr 21 01:55:46 PDT 2009,lucyxechelon,mean blog people,0


In [14]:
# Show the first five sampled rows with target == 4.
pos_df.head(n=5)

Unnamed: 0,ids,date,user,text,target
1129018,1975455456,Sat May 30 15:11:36 PDT 2009,MELi_BELLY,is celebrating people that I love! --&gt; Lov...,4
1075997,1967341404,Fri May 29 19:24:16 PDT 2009,Vinamae,with my babe. bye!,4
1153687,1978844458,Sun May 31 00:40:03 PDT 2009,BrinaM,Morning all. Gorgeous sunny day again here and...,4
1388967,2053073343,Sat Jun 06 03:09:58 PDT 2009,CrashLiverar,@JoanneDuran It looks like you made 900 follow...,4
1341127,2031190204,Thu Jun 04 09:27:46 PDT 2009,eyaustin,@LynzMichelle Its exciting because we had a lo...,4


In [15]:
# Stack the two samples row-wise into a single frame (neg_df rows first,
# then pos_df rows); the original row labels are kept.
use_df = pd.concat([neg_df, pos_df], axis="index")

In [16]:
# List the distinct label values present in the combined frame.
pd.unique(use_df["target"])

array([0, 4])

In [17]:
# Report how many rows carry each label, to confirm the subset is balanced.
print("Number of 0s: ", len(use_df.loc[use_df["target"] == 0]))
print("Number of 4s: ", len(use_df.loc[use_df["target"] == 4]))

Number of 0s:  25000
Number of 4s:  25000


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [19]:
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle so the
# same split is produced on every run.
train_x, test_x, train_y, test_y = train_test_split(
    use_df["text"],
    use_df["target"],
    test_size=0.2,
    random_state=5,
)

In [20]:
# Build TF-IDF features. The vectorizer is fitted on the training text only;
# the test text is then transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(
    min_df=5,           # drop terms found in fewer than 5 documents
    max_df=0.8,         # drop terms found in more than 80% of documents
    sublinear_tf=True,  # scale term frequency as 1 + log(tf)
    use_idf=True,
)
train_x_tfidf = tfidf.fit_transform(train_x)
test_x_tfidf = tfidf.transform(test_x)

In [21]:
# logistic regression
from sklearn.linear_model import LogisticRegression
def logistic_regression(train_x, train_y):
  """Fit a logistic-regression classifier and return it.

  Args:
    train_x: training features, passed straight to ``fit``.
    train_y: training labels, passed straight to ``fit``.

  Returns:
    The fitted ``LogisticRegression`` estimator, built with
    ``max_iter=100000``.
  """
  # ``fit`` returns the estimator itself, so the result can be returned as-is.
  return LogisticRegression(max_iter=100000).fit(train_x, train_y)

# svm
from sklearn.svm import SVC
def svm(train_x, train_y):
    """Fit a support-vector classifier and return it.

    Args:
        train_x: training features, passed straight to ``fit``.
        train_y: training labels, passed straight to ``fit``.

    Returns:
        The fitted ``SVC`` estimator, built with ``C=1.0`` and
        ``max_iter=100000``.
    """
    # ``fit`` returns the estimator itself, so the result can be returned as-is.
    return SVC(C=1.0, max_iter=100000).fit(train_x, train_y)

# knn
from sklearn.neighbors import KNeighborsClassifier
def knn(train_x, train_y):
  """Fit a k-nearest-neighbours classifier and return it.

  Args:
    train_x: training features, passed straight to ``fit``.
    train_y: training labels, passed straight to ``fit``.

  Returns:
    The fitted ``KNeighborsClassifier``, built with its default settings.
  """
  # ``fit`` returns the estimator itself, so the result can be returned as-is.
  return KNeighborsClassifier().fit(train_x, train_y)

# decision tree
from sklearn import tree
def decision_tree(train_x, train_y, random_state=5):
  """Fit a decision-tree classifier and return it.

  Args:
    train_x: training features, passed straight to ``fit``.
    train_y: training labels, passed straight to ``fit``.
    random_state: seed forwarded to ``DecisionTreeClassifier``. Defaults to 5,
      the same seed this notebook uses for sampling and for the train/test
      split, so repeated runs build the same tree. Pass ``None`` to get the
      previous unseeded behaviour.

  Returns:
    The fitted ``DecisionTreeClassifier``.
  """
  # Seed the estimator so that repeated runs give the same tree and accuracy.
  classifier = tree.DecisionTreeClassifier(random_state=random_state)
  classifier.fit(train_x, train_y)
  return classifier

In [22]:
# run logistic regression: fit on the TF-IDF training matrix, predict the
# held-out rows, then print the accuracy and the elapsed time.
import time
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
classifier = logistic_regression(train_x_tfidf, train_y)
log_predictions = classifier.predict(test_x_tfidf)
print(accuracy_score(test_y, log_predictions))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.7757
--- 0.3509812355041504 seconds ---


In [23]:
# run svm: fit on the TF-IDF training matrix, predict the held-out rows, then
# print the accuracy and the elapsed time. This is by far the slowest cell
# (several minutes in the recorded run).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
classifier = svm(train_x_tfidf, train_y)
predictions = classifier.predict(test_x_tfidf)
print(accuracy_score(test_y, predictions))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.7782
--- 385.6909034252167 seconds ---


In [24]:
# run knn: fit on the TF-IDF training matrix, predict the held-out rows, then
# print the accuracy and the elapsed time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
classifier = knn(train_x_tfidf, train_y)
predictions = classifier.predict(test_x_tfidf)
print(accuracy_score(test_y, predictions))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.5713
--- 9.361361026763916 seconds ---


In [25]:
# run decision tree: fit on the TF-IDF training matrix, predict the held-out
# rows, then print the accuracy and the elapsed time.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
classifier = decision_tree(train_x_tfidf, train_y)
predictions = classifier.predict(test_x_tfidf)
print(accuracy_score(test_y, predictions))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.6795
--- 9.885319471359253 seconds ---
