### Importing libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
import re 

In [3]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
import nltk
import seaborn as sns

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [7]:
import pickle

In [8]:
from wordcloud import WordCloud

## Reading in the dataset

In [9]:
data = pd.read_csv('labeled_data.csv')

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [11]:
def data_processing(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"https\S+|www\S+http\S+", '', tweet, flags = re.MULTILINE)
    tweet = re.sub(r'\@w+|\#','', tweet)
    tweet = re.sub(r'[^\w\s]','',tweet)
    tweet = re.sub(r'ð','',tweet)
    tweet_tokens = word_tokenize(tweet)
    filtered_tweets = [w for w in tweet_tokens if not w in stop_words]
    return " ".join(filtered_tweets)

In [12]:
data.tweet = data['tweet'].apply(data_processing)

In [13]:
data = data.drop_duplicates('tweet')

In [14]:
lemmatizer = WordNetLemmatizer()
def lemmatizing(data):
    tweet = [lemmatizer.lemmatize(word) for word in data]
    return data

In [15]:
data['tweet'] = data['tweet'].apply(lambda x: lemmatizing(x))

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24698 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24698 non-null  int64 
 1   count               24698 non-null  int64 
 2   hate_speech         24698 non-null  int64 
 3   offensive_language  24698 non-null  int64 
 4   neither             24698 non-null  int64 
 5   class               24698 non-null  int64 
 6   tweet               24698 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.5+ MB


In [17]:
vect=TfidfVectorizer(ngram_range=(1,2)).fit(data['tweet'])

In [18]:
X = data['tweet']

In [19]:
y = data['class']

In [20]:
from sklearn.pipeline import make_pipeline 

In [25]:
pipeline = make_pipeline(TfidfVectorizer(ngram_range=(1,2)))

In [26]:
X = vect.transform(X)

In [27]:
X

<24698x189343 sparse matrix of type '<class 'numpy.float64'>'
	with 415095 stored elements in Compressed Sparse Row format>

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [29]:
print("Size of X_train", (X_train.shape))
print("Size of Y_train", (y_train.shape))
print("Size of X_test", (X_test.shape))
print("Size of Y_test", (y_test.shape))

Size of X_train (19758, 189343)
Size of Y_train (19758,)
Size of X_test (4940, 189343)
Size of Y_test (4940,)


## Model 1: Logistics Regression

In [30]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train) #loading x_train and y_train data on model
logreg_predict = logreg.predict(X_test) #predicting the value for test data
logreg_acc = accuracy_score(logreg_predict, y_test)

In [31]:
logreg_acc

0.8651821862348178

In [40]:
test = "i love to display my affection its wholesome"
test_pre = data_processing(test)
test_vec = vect.transform([test_pre])

In [33]:
logreg.predict(test_vec)

array([0], dtype=int64)

## Model 2 Logistics Regression with hyperparameter tuning

In [34]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [35]:
param_grid = {'C':[100, 10, 1.0, 0.1, 0.01], 'solver' :['newton-cg', 'lbfgs','liblinear']}
grid = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
grid.fit(X_train, y_train)
print("Best Cross validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best Cross validation score: 0.89
Best parameters:  {'C': 100, 'solver': 'lbfgs'}


In [36]:
print("done")

done


In [37]:
log_grid_pred = grid.predict(X_test)

In [38]:
log_grid_acc = accuracy_score(log_grid_pred, y_test)
print("Test accuracy: {:.2f}%".format(log_grid_acc*100)) #printing model accuracy after applying hyperparamenter tuning

Test accuracy: 89.47%


In [41]:
grid.predict(test_vec)

array([1], dtype=int64)

## Model 3: Decision tree classifier

In [42]:
from sklearn.tree import DecisionTreeClassifier

In [43]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train) #loading x_train and y_train data on model
dtree_pred = dtree.predict(X_test) #predicting the value for test data
dtree_acc = accuracy_score(dtree_pred, y_test)

In [44]:
dtree_acc

0.8811740890688259

## Model 4: Random Forest

In [69]:
from sklearn.ensemble import RandomForestClassifier

In [72]:
print("hello")

hello


## Model 5 XG boost

In [74]:
from xgboost import XGBClassifier

# Instantiate the XGBClassifier
xgb = XGBClassifier()

# Fit the model to the training data
xgb.fit(X_train, y_train)

# Predict on the test data
xgb_pred = xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, xgb_pred)
print("Test accuracy:", accuracy)

Test accuracy: 0.9006072874493927


In [77]:
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
#     'min_child_weight': [1, 3, 5, 7],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# }

# grid = GridSearchCV(xgb, param_grid, cv=5)
# grid.fit(X_train,y_train)

# print("Best Cross validation score: {:.2f}".format(grid.best_score_))
# print("Best parameters: ", grid.best_params_)

# xgb_hyper_pred = grid.predict(X_test)
# accuracy = accuracy_score(y_test, xgb_hyper_pred)
# print("Test accuracy:", accuracy)

In [91]:
xgb.predict(test_vec)

array([0], dtype=int64)

In [93]:
pickle.dump(xgb, open('model2.pkl', 'wb'))

In [1]:
import pickle

In [2]:
model = pickle.load(open('model2.pkl', 'rb'))

In [33]:
test = "gay ass nigger"
test_pre = data_processing(test)
test_vec = vect.transform([test_pre])

In [34]:
model.predict(test_vec)

array([0], dtype=int64)

In [45]:
import pickle

In [46]:
pickle.dump(grid, open('modelgrid.pkl', 'wb'))

In [47]:
modeltest = pickle.load(open('modelgrid.pkl', 'rb'))

In [48]:
modeltest.predict(test_vec)

array([1], dtype=int64)