In [1]:
import pandas as pd
import numpy as np
import re
from nltk import WordNetLemmatizer
import spacy
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTEN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [2]:
df=pd.read_csv('H:/ExcelR Data Science/Project P72/emails1_edit.txt')
df.drop(['Unnamed: 0', 'filename', 'Message-ID'],axis=1,inplace=True)
df.head()

Unnamed: 0,content,Class
0,eat shit\r\n\r\n\r\n\r\n\r\nJohn J Lavorato@ex...,Abusive
1,fuck you,Abusive
2,Gentlemen:\r\nThe following champagne is avail...,Abusive
3,sorry i've taken so long...just been trying to...,Abusive
4,asshole\r\n\r\n\r\n\r\n\r\nJohn J Lavorato@exc...,Abusive


In [3]:
# Dropping duplicates
df.drop_duplicates(keep='first',inplace=True)
df.reset_index(drop=True,inplace=True)

In [4]:
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words
adds= ['subject','image']
sw_spacy.update(adds)

In [5]:
def preprocess_text(text):
    
    # removing email id tags
    text=re.sub('\S*@\S*\s?',' ',text)
    
    # Removing url
    text=re.sub(r'http\S+', ' ',text)
    
    # removing r\n\ pattern
    text=re.sub('[\r\n]+', ' ',text)
    
    # removing numbers and special characters
    text=re.sub('[^A-Za-z ]+', ' ',text)
    
    # removing words beginning with capital letters
    text= re.sub('([^.])( [A-Z]\w*)', r'\1',text)
    
    # Removing words less than lenghth 3
    short = re.compile(r'\W*\b\w{1,3}\b')
    text= short.sub(' ',text)
    
    # Converting to lowercase
    text=text.lower()
    
    # Lemmatize
    lm = WordNetLemmatizer()
    text= lm.lemmatize(text) 
    
    # Stopwords removal
    text = ' '.join([word for word in text.split() if word not in sw_spacy])

    # Blank lines
    text=re.sub(r'^$\n', '', text, flags=re.MULTILINE)
    
    # blank spaces
    text=' '.join([line for line in text.split('\n') if line.strip() != ''])
    #text=' '.join()
    return text

In [6]:
df['cleaned_text']= df['content'].apply(preprocess_text)

In [7]:
# Dropping content column
df.drop('content',axis=1,inplace=True)
df.rename(columns={'cleaned_text':'Text'},inplace=True)
df.head()

Unnamed: 0,Class,Text
0,Abusive,shit bets clev
1,Abusive,fuck
2,Abusive,gentlemen following champagne available approx...
3,Abusive,sorry taken long trying fend chicks life soooo...
4,Abusive,asshole john cant gambling problem away bills ...


In [8]:
# Checking for empty strings
count=0
index=[]
for i in range (len(df['Text'])):
    if not(len(df['Text'][i].strip())):
        index.append(i)
        count=count+1
print(count)      

244


In [9]:
# Dropping rows containing empty strings
df.drop(index,inplace=True)

df.reset_index(drop=True,inplace=True)

print('Shape of dataframe after dropping empty strings:',df.shape)

Shape of dataframe after dropping empty strings: (24412, 2)


In [18]:
X= df.iloc[:,1]
y= df.iloc[:,0]

In [19]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,stratify= y,random_state=1)

print('Shape of training data:',(X_train.shape))
print('Shape of testing data:',(X_test.shape))

Shape of training data: (17088,)
Shape of testing data: (7324,)


In [20]:
print('Distribution of y_train before resampling:',Counter(y_train))

# Converting X_train from series to 2D array
X_train= X_train.values.reshape(-1,1)

# Oversampling
oversample = SMOTEN(sampling_strategy='minority')
X_train, y_train = oversample.fit_resample(X_train, y_train)
print('Distribution of y_train after resampling:',Counter(y_train))

Distribution of y_train before resampling: Counter({'Non Abusive': 15941, 'Abusive': 1147})
Distribution of y_train after resampling: Counter({'Non Abusive': 15941, 'Abusive': 15941})


In [21]:
## Label Encoding Y
le= LabelEncoder()
le.fit(y_train)
y_train= le.transform(y_train)
y_test = le.transform(y_test)

print(le.classes_)

['Abusive' 'Non Abusive']


In [22]:
X_train1= X_train.copy()
X_test1= X_test.copy()
y_train1= y_train.copy()
y_test1= y_test.copy()

In [23]:
# Converting 2D array to 1D array
X_train1= X_train1.ravel()
X_test1= X_test1.ravel()

In [24]:
tf= TfidfVectorizer()

tf.fit(X_train1)

X_train1=tf.transform(X_train1)
X_test1=tf.transform(X_test1)

In [25]:
from sklearn.svm import LinearSVC
lsvc= LinearSVC()
lsvc.fit(X_train1,y_train)

lsvc_pred= lsvc.predict(X_test1)

names=['0-Abusive','1-Non Abusive']
print(classification_report(y_test, lsvc_pred,target_names=names))

               precision    recall  f1-score   support

    0-Abusive       0.92      0.66      0.77       491
1-Non Abusive       0.98      1.00      0.99      6833

     accuracy                           0.97      7324
    macro avg       0.95      0.83      0.88      7324
 weighted avg       0.97      0.97      0.97      7324



In [26]:
import pickle
pickle.dump(tf, open('final_vectorizer.pkl','wb'))
pickle.dump(lsvc,open('final_lsvc_model.pkl','wb'))