In [168]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
1#
df=pd.read_csv('C:/Users/Admin/Desktop/AI&DS/TwitterHate.csv',sep=',')

In [190]:
df.shape

(31962, 3)

In [7]:
df.head(2)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...


In [17]:
#2. Get the tweets into a list for easy text cleanup and manipulation.
tweet=[df.iloc[i,2] for i in range(df.shape[0])]
len(tweet)

31962

In [109]:
#3. Cleaning up
def clean(twe):
    A=twe.lower()
    A=re.sub(r"@\S+", "", A)
    A=re.sub(r"http\S+", "", A)
    A=re.sub(r"#","",A)
    A=re.sub(r'\b\w{,1}\b', '', A) 
    return A

tweet=[clean(sen) for sen in tweet]



In [110]:
#tokenization
tknzr=TweetTokenizer()
tweet_token=[tknzr.tokenize(text) for text in tweet]

stop_word=set(stopwords.words('english'))
tweet_token2=[]
for tex in tweet_token:
    out=[i for i in set(tex) if not i in stop_word]
    tweet_token2.append(out)

In [111]:
tweet_token2[0:2]



[['dysfunction',
  'father',
  'drags',
  'run',
  'selfish',
  '.',
  'dysfunctional',
  'kids'],
 ['offer',
  'use',
  'getthanked',
  'pdx',
  '.',
  'vans',
  'disapointed',
  'credit',
  'lyft',
  'wheelchair',
  'thanks',
  'cause',
  "'"]]

In [115]:
#4 we remove terms with a length of 1.
tweet_token3=[]
for tex in tweet_token2:
    out=[i for i in tex if len(i)>1]
    tweet_token3.append(out)

In [191]:
tweet_token3[0]

['dysfunction', 'father', 'drags', 'run', 'selfish', 'dysfunctional', 'kids']

In [193]:
#5. Check out the top terms in the tweets:
#5.1 First, get all the tokenized terms into one large list.
tweet_liste=[]
for text in tweet_token3:
    for i in text:
        tweet_liste.append(i)


#5.2 Use the counter and find the 10 most common terms.
dico={}
for i in tweet_liste:
    a=dico.get(i,0)
    dico[i]=a+1

print('The 10 most common terms with their frequences are')
for i in range(10):
    max_key = max(dico, key=dico.get)
    print(max_key, dico[max_key])
    del(dico[max_key])


The 10 most common terms with their frequences are
love 2526
... 2520
day 2118
happy 1587
time 1099
today 1059
like 1037
life 1037
positive 913
new 912


In [194]:
#6.Data formatting for predictive modeling:
#6.1 Join the tokens back to form strings. 
tweet2=[(' ').join(twe)  for twe in tweet_token3]


#6.2 Assign x and y.
X=tweet2; y=df['label']

#Perform train_test_split using sklearn.
x_train,x_test, y_train,y_test=train_test_split(X,y,random_state=1)

In [197]:
#7. We’ll use TF-IDF values for the terms as a feature to get into a vector space model.
tf=TfidfVectorizer(decode_error='ignore',lowercase=False,max_features=5000)

x_traintf=tf.fit_transform(x_train)

x_testtf=tf.fit_transform(x_test)

In [198]:
#8.Model building: Ordinary Logistic Regression
clf = LogisticRegression(random_state=0)

#Fit into  the train data.
clf.fit(x_traintf,y_train)

#Make predictions for the train and the test set.
clf.predict(x_traintf)
clf.predict(x_testtf)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [201]:
#9. Model evaluation: Accuracy, recall, and f_1 score.

#Report the accuracy on the train set.
score = clf.score(x_traintf, y_train)
print('The accuracy is', score)

#Report the recall on the train set: decent, high, or low.
#Get the f1 score on the train set.
y_pred=clf.predict(x_traintf)
print(classification_report(y_train, y_pred))
print('the recall is high however the model focuses a lot on 0 class')

The accuracy is 0.9556964665637645
             precision    recall  f1-score   support

          0       0.96      1.00      0.98     22269
          1       0.96      0.39      0.56      1702

avg / total       0.96      0.96      0.95     23971

the recall is high however the model focuses a lot on 0 class


In [202]:
#10. Adjust the appropriate class in the LogisticRegression model.
weights = {0:1.0, 1:10.0}
clf2 = LogisticRegression(solver='lbfgs', class_weight=weights)

In [203]:
#11. Train again with the adjustment and evaluate.
#Evaluate the predictions on the train set: accuracy, recall, and f_1 score.
clf2.fit(x_traintf,y_train)
y_pred2=clf2.predict(x_traintf)

score = clf2.score(x_traintf, y_train)
print(score)
print(classification_report(y_train, y_pred2))

0.9632472571023319
             precision    recall  f1-score   support

          0       1.00      0.96      0.98     22269
          1       0.67      0.97      0.79      1702

avg / total       0.97      0.96      0.97     23971



In [204]:
#We can provide best weights using grid search
model = LogisticRegression(solver='lbfgs')
balance = [{0:1,1:10}, {0:1,1:100}, {0:1,1:1}, {0:10,1:1}, {0:100,1:1}]
param_grid = dict(class_weight=balance)
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=1, random_state=1)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='recall')
grid_result = grid.fit(x_traintf,y_train)

In [205]:
grid_result.best_score_

0.8719241658004988

In [206]:
#12. Regularization and Hyperparameter tuning:

parameters = {'penalty':('l1', 'l2'), 'C':[1, 10]}
clf = LogisticRegression(random_state=0,class_weight='balanced')


In [207]:
#13 Find the parameters with the best recall in cross validation.
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=1, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=parameters, n_jobs=-1, cv=cv, scoring='recall')

grid_result = grid.fit(x_traintf,y_train)


In [208]:
#14  best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.783199 using {'C': 1, 'penalty': 'l2'}


In [212]:
#15 Predict and evaluate using the best estimator.
clf = LogisticRegression(random_state=0,class_weight={0:1.0, 1:100.0},penalty='l2',C=1)
clf.fit(x_traintf,y_train)
y_pred2=clf.predict(x_testtf)

score = clf.score(x_testtf, y_test)
print(score)
print(classification_report(y_test, y_pred2))

0.6501063696658741
             precision    recall  f1-score   support

          0       0.94      0.67      0.78      7451
          1       0.08      0.40      0.13       540

avg / total       0.88      0.65      0.74      7991

