In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import numpy as np



# Load Data 

In [2]:
import pandas as pd #Pandas library used to read data from csv file into Dataframe

#Load Training Data online
url = 'https://drive.google.com/file/d/1_qtDnJKZOvTJ84HkHXzjt4C_jOLH9pOq/view?usp=sharing'
url2='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url2)


#Another way to read data is by uploading the file then reading, we will cover this in next workshop


In [3]:
df.head() #checkout the contents of data

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
df.shape #Check how many rows & columns in data

(40000, 2)

In [5]:
#check the number of positive and negative classes 

df['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

# Vectorize Data

In [6]:
#This shows that data is balanced. Now moving towards processing data


from sklearn.feature_extraction.text import TfidfVectorizer
# learn more about tfidf vectorizer and details of library https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


corpus=df['text'] #using only text column from dataset to create features, not using labels

vectorizer = TfidfVectorizer(min_df=20,stop_words='english')  #remove stopwords that are in English dictionary 

#additional line if you want to remove a specific word

X = vectorizer.fit_transform(corpus)

Xarray=X.toarray()
	
print("Collection vectorized. %d rows and %d cols." % Xarray.shape )


trainx=Xarray #saving the resulting features as trainx 


trainy=df['label'].values #converting target value (labels) into an array





Collection vectorized. 40000 rows and 15492 cols.


In [7]:
#Testing
turl = 'https://drive.google.com/file/d/1xe7E0KJiPN2sscVa7OV0GFWxLjKcXU_1/view?usp=sharing'
turl2='https://drive.google.com/uc?id=' + turl.split('/')[-2]

test=pd.read_csv(turl2)

test.shape

(5000, 2)

In [8]:
#test data


textarr=test['text'].values

testx = vectorizer.transform(textarr)


testy=test['label'].values #converting dataframe column into an array for actual predictions 

In [9]:
print("Training Features: %d rows and %d cols." % trainx.shape )
print("Test Features: %d rows and %d cols." % testx.shape )

Training Features: 40000 rows and 15492 cols.
Test Features: 5000 rows and 15492 cols.


In [11]:
print(trainx[0])

[0. 0. 0. ... 0. 0. 0.]


In [20]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [21]:
print(trainx[0])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.14687152 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

# Apply Algorithm

In [22]:
from sklearn.linear_model import LogisticRegression
#check out for more details and parameters: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


lr= LogisticRegression()

#lr= LogisticRegression(penalty='l1',solver='liblinear',class_weight="balanced")

 
lr= lr.fit(trainx,trainy)

In [23]:
lr.score(trainx, trainy) #check accuracy

0.92485

# Test Model

In [24]:
testy=test['label'].values #converting dataframe column into an array

textarr=test['text'].values


testx = vectorizer.transform(textarr)

print(testx.shape)

(5000, 15492)


In [25]:
p = lr.predict(testx)



print(lr.score(testx, testy))

0.8938


In [26]:
print("Accuracy on train data: %s" % lr.score(trainx, trainy))
print("Accuracy on test data: %s" % lr.score(testx, testy))

Accuracy on train data: 0.92485
Accuracy on test data: 0.8938


In [27]:
from sklearn.metrics import classification_report


print(classification_report(testy,p, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89      2495
           1       0.88      0.91      0.90      2505

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



# Save model for further use

In [None]:
import pickle 


learn_file='fittedLRModel_%d' % (1)


lr= LogisticRegression()
lr= lr.fit(trainx,trainy)
with open(learn_file,'wb') as f:
   pickle.dump([lr,vectorizer],f)

print("Base Classifier Built and stored in %s!!" %learn_file)

# Additional optional read - what to do if data is unbalanced

In [12]:
#grid search for parameters
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import numpy as np



parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(trainx, trainy)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best parameters:  {'C': 5.263252631578947}
best scrores:  0.8538


In [14]:
lr_clf = LogisticRegression(C=5.2632)
lr_clf.fit(trainx, trainy)



LogisticRegression(C=5.2632)

In [24]:
#df=df2

positiveclass=df['label'].value_counts()
posnum= positiveclass[1] #number of positives in example


# cross validation with the train subset
# 2 fold at the moment....

nfold=4
Fbeta=1.0
#Cs to be tested
Cs=[2**y for y in range(-10,10)]

# matrices to store the results. nrows: number of Cs. ncols: number of folds
perfacc=np.zeros([len(Cs),nfold])
# F1 wrt the positive class
perfF1=np.zeros([len(Cs),nfold])
perfP=np.zeros([len(Cs),nfold])
perfR=np.zeros([len(Cs),nfold])

print("Cross validating...")
print("Cs tested:"),
print(Cs)



for w in [2**x for x in range(10)]:
 print("Weight:%d"%w)
 for f in range(nfold):
  #print("Fold %d" % (f+1))
  ftest=[x for x in range(len(trainy)) if x%nfold==f]
  ftrain=[x for x in range(len(trainy)) if x%nfold!=f]
  for c in range(len(Cs)):
    #lr= LogisticRegression(penalty='l1',solver='liblinear',C=Cs[c], class_weight='balanced')
    lr= LogisticRegression(penalty='l1',solver='liblinear',C=Cs[c], class_weight={0:(1.0/(1.0+w)),1:(w/(1.0+w))})
    lr= lr.fit(trainx[ftrain,],trainy[ftrain])
    preds = lr.predict(trainx[ftest,])
    acc = float(sum(preds==trainy[ftest,]))/float(len(preds))
    ipos = np.where(trainy[ftest,]==1)
    ineg = np.where(trainy[ftest,]==0)
    TP = sum(preds[ipos]==trainy[ftest,][ipos])
    FP = sum(preds[ineg]!=trainy[ftest,][ineg])
    TN = sum(preds[ineg]==trainy[ftest,][ineg])
    FN = sum(preds[ipos]!=trainy[ftest,][ipos])
    if (TP!=0): 
      P = float(TP)/float(TP+FP)
    else:
      P = 0
    if (TP!=0): 
      R = float(TP)/float(TP+FN)
    else:
      R = 0
    perfacc[c,f]=acc
    perfP[c,f]=P
    perfR[c,f]=R
    if (TP!=0): perfF1[c,f]=(1+Fbeta**2) * (P*R)/((Fbeta**2)*P+R)
    else: perfF1[c,f]=0

 meansF1=np.mean(perfF1,axis=1)
 meansP=np.mean(perfP,axis=1)
 meansR=np.mean(perfR,axis=1)
 bestC=np.argmax(meansF1)

 print("Best C:%f, F_%f:%f, P:%f, R:%f (Tpos:%d, pos:%d)" %(Cs[bestC],Fbeta,meansF1[bestC],meansP[bestC],meansR[bestC], round(meansR[bestC]*posnum), round(round(meansR[bestC]*posnum)/meansP[bestC]) ))

 #for c in range(len(Cs)):
 # print("C:%f,Acc:%f, F1:%f" % (Cs[c],np.mean(perfacc,axis=1)[c],np.mean(perfF1,axis=1)[c]) )


Cross validating...
Cs tested:
[0.0009765625, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
Weight:1
Best C:4.000000, F_1.000000:0.848969, P:0.831800, R:0.866977 (Tpos:2191, pos:2634)
Weight:2
Best C:8.000000, F_1.000000:0.852966, P:0.801593, R:0.911979 (Tpos:2305, pos:2876)
Weight:4
Best C:16.000000, F_1.000000:0.846274, P:0.782572, R:0.921945 (Tpos:2330, pos:2977)
Weight:8
Best C:32.000000, F_1.000000:0.842108, P:0.771320, R:0.927897 (Tpos:2345, pos:3040)
Weight:16
Best C:64.000000, F_1.000000:0.839557, P:0.763739, R:0.932612 (Tpos:2357, pos:3086)
Weight:32
Best C:128.000000, F_1.000000:0.837850, P:0.759023, R:0.935372 (Tpos:2364, pos:3115)
Weight:64
Best C:512.000000, F_1.000000:0.835462, P:0.769800, R:0.913660 (Tpos:2309, pos:2999)
Weight:128
Best C:512.000000, F_1.000000:0.832221, P:0.748015, R:0.938119 (Tpos:2371, pos:3170)
Weight:256
Best C:512.000000, F_1.000000:0.816037, P:0.707233, R:0.964853 (Tpos:2438

In [28]:
import pickle #Fit on 5k rows data


optimalC=8
optimalw=2

learn_file='fittedLRModel_C_%d_w_%d' % (optimalC,optimalw)


lr= LogisticRegression(penalty='l1',solver='liblinear',C=optimalC, class_weight={0:(1.0/(1.0+optimalw)),1:   (optimalw/(1.0+optimalw))})
lr= lr.fit(trainx,trainy)
with open(learn_file,'wb') as f:
   pickle.dump([lr,vectorizer],f)

print("Base Classifier Built and stored in %s!!" %learn_file)

Base Classifier Built and stored in fittedLRModel_C_8_w_2!!


0.5502

In [25]:
corpus=df['text']

vectorizer = TfidfVectorizer(min_df=20,stop_words='english')

X = vectorizer.fit_transform(corpus)
Xarray=X.toarray()
	
print("Collection vectorized. %d rows and %d cols." % Xarray.shape )
Xarray


trainy=df['label'].values #converting dataframe column into an array

trainx=Xarray

#-------


Collection vectorized. 40000 rows and 15492 cols.


In [8]:
#Without any class-weight 

#optimalC=8
#optimalw=32



#lr= LogisticRegression(penalty='l1',solver='liblinear',C=optimalC, class_weight='balanced')

lr= LogisticRegression(penalty='l1',solver='liblinear', class_weight='balanced')


lr= lr.fit(trainx,trainy)


testy=test['label'].values #converting dataframe column into an array

textarr=test['text'].values


testx = vectorizer.transform(textarr)
p = lr.predict(testx)

print(testx.shape)

print(lr.score(testx, testy))


from sklearn.metrics import classification_report


print(classification_report(testy,p, target_names=['0','1']))

(5000, 15492)
0.8832
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      2495
           1       0.87      0.90      0.89      2505

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

