### Importing libraries

In [5]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [6]:
import re 

In [7]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [8]:
import nltk
import seaborn as sns

In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [11]:
import pickle

In [12]:
from wordcloud import WordCloud

## Reading in the dataset

In [14]:
data = pd.read_csv('labeled_data.csv', on_bad_lines='skip')


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24781 entries, 0 to 24780
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24781 non-null  int64 
 1   count               24781 non-null  int64 
 2   hate_speech         24781 non-null  int64 
 3   offensive_language  24781 non-null  int64 
 4   neither             24781 non-null  int64 
 5   class               24781 non-null  int64 
 6   tweet               24781 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


## Pre processing the data

In [16]:
def data_processing(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"https\S+|www\S+http\S+", '', tweet, flags = re.MULTILINE)
    tweet = re.sub(r'\@w+|\#','', tweet)
    tweet = re.sub(r'[^\w\s]','',tweet)
    tweet = re.sub(r'ð','',tweet)
    tweet_tokens = word_tokenize(tweet)
    filtered_tweets = [w for w in tweet_tokens if not w in stop_words]
    return " ".join(filtered_tweets)

In [17]:
data.tweet = data['tweet'].apply(data_processing)

In [18]:
data = data.drop_duplicates('tweet')

In [19]:
lemmatizer = WordNetLemmatizer()
def lemmatizing(data):
    tweet = [lemmatizer.lemmatize(word) for word in data]
    return data

In [20]:
data['tweet'] = data['tweet'].apply(lambda x: lemmatizing(x))

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24696 entries, 0 to 24780
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24696 non-null  int64 
 1   count               24696 non-null  int64 
 2   hate_speech         24696 non-null  int64 
 3   offensive_language  24696 non-null  int64 
 4   neither             24696 non-null  int64 
 5   class               24696 non-null  int64 
 6   tweet               24696 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.5+ MB


## Train Test split

In [22]:
vect=TfidfVectorizer(ngram_range=(1,2)).fit(data['tweet'])

In [23]:
X = data['tweet']

In [24]:
y = data['class']

In [25]:
X = vect.transform(X)

In [26]:
X

<24696x189321 sparse matrix of type '<class 'numpy.float64'>'
	with 415055 stored elements in Compressed Sparse Row format>

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [28]:
print("Size of X_train", (X_train.shape))
print("Size of Y_train", (y_train.shape))
print("Size of X_test", (X_test.shape))
print("Size of Y_test", (y_test.shape))

Size of X_train (19756, 189321)
Size of Y_train (19756,)
Size of X_test (4940, 189321)
Size of Y_test (4940,)


## Model 1: Logistics Regression

In [29]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train) #loading x_train and y_train data on model
logreg_predict = logreg.predict(X_test) #predicting the value for test data
logreg_acc = accuracy_score(logreg_predict, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
logreg_acc

0.8663967611336032

In [31]:
test = "i love to display my affection its wholesome"
test_pre = data_processing(test)
test_vec = vect.transform([test_pre])

In [32]:
logreg.predict(test_vec)

array([1], dtype=int64)

## Model 2 Logistics Regression with hyperparameter tuning

In [33]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [34]:
param_grid = {'C':[100, 10, 1.0, 0.1, 0.01], 'solver' :['newton-cg', 'lbfgs','liblinear']}
grid = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
grid.fit(X_train, y_train)
print("Best Cross validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best Cross validation score: 0.89
Best parameters:  {'C': 100, 'solver': 'lbfgs'}


In [35]:
print("done")

done


In [36]:
log_grid_pred = grid.predict(X_test)

In [37]:
log_grid_acc = accuracy_score(log_grid_pred, y_test)
print("Test accuracy: {:.2f}%".format(log_grid_acc*100)) #printing model accuracy after applying hyperparamenter tuning

Test accuracy: 89.07%


In [38]:
grid.predict(test_vec)

array([1], dtype=int64)

## Model 3: Decision tree classifier

In [39]:
from sklearn.tree import DecisionTreeClassifier

In [40]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train) #loading x_train and y_train data on model
dtree_pred = dtree.predict(X_test) #predicting the value for test data
dtree_acc = accuracy_score(dtree_pred, y_test)

In [41]:
dtree_acc

0.879757085020243

## Model 4: Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
rfc = RandomForestClassifier()

In [44]:
rfc.fit(X_train, y_train)

In [45]:
rfc_pred = rfc.predict(X_test)

In [54]:
rfc_ac = accuracy_score(y_test, rfc_pred)

In [56]:
rfc_ac = rfc_ac
rfc_ac*100

87.04453441295547

## Model 5 XG boost

In [48]:
from xgboost import XGBClassifier

# Instantiate the XGBClassifier
xgb = XGBClassifier()

# Fit the model to the training data
xgb.fit(X_train, y_train)

# Predict on the test data
xgb_pred = xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, xgb_pred)
print("Test accuracy:", accuracy)

Test accuracy: 0.8935222672064778


In [49]:
xgb.predict(test_vec)

array([2], dtype=int64)

## Exporting the models

In [93]:
pickle.dump(xgb, open('model2.pkl', 'wb'))

In [52]:
test = "this isnt any normal method this is the best way to success"
test_pre = data_processing(test)
test_vec = vect.transform([test_pre])

In [46]:
pickle.dump(grid, open('modelgrid.pkl', 'wb'))

In [47]:
modeltest = pickle.load(open('modelgrid.pkl', 'rb'))

In [53]:
modeltest.predict(test_vec)

array([2], dtype=int64)

In [54]:
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vect, vectorizer_file)

In [69]:
test = "what classifies as hateful need not be always hateful"
test_pre = data_processing(test)
test_vec = vect.transform([test_pre])

In [70]:
dtree.predict(test_vec)

array([2], dtype=int64)

In [71]:
xgb.predict(test_vec)

array([2], dtype=int64)

In [72]:
pickle.dump(xgb, open('xgb_model.pkl', 'wb'))

In [74]:
model_xgb = pickle.load(open('xgb_model.pkl', 'rb'))

model_xgb.predict(test_vec)

In [75]:
model_xgb.predict(test_vec)

array([2], dtype=int64)

## Tabulating results

In [57]:
dict1 = {"Model Name": ["Logistic Regression", "Logistic Regression with Hyperparamter tuning", "Decision Tree Classifier", "Random Forest Regression", "XG Boost"], 
         "accuracy": [logreg_acc, log_grid_acc, dtree_acc, rfc_ac, accuracy]}

In [58]:
import pandas as pd
d1 = pd.DataFrame(dict1)

In [59]:
d1

Unnamed: 0,Model Name,accuracy
0,Logistic Regression,0.866397
1,Logistic Regression with Hyperparamter tuning,0.890688
2,Decision Tree Classifier,0.879757
3,Random Forest Regression,0.870445
4,XG Boost,0.893522
