In [None]:
# Download the hate speech CSV from Google Drive by its file ID (requires the gdown CLI).
# NOTE(review): presumably this is saved as "labeled_tweet_data.csv", the file the
# notebook reads later -- confirm the downloaded file name.
!gdown 1oePItYlQYpQzG4FBreSFEIc7i8F662Vt

In [None]:
# Download the bad words CSV from Google Drive by its file ID (requires the gdown CLI).
# NOTE(review): presumably this is saved as "bad-words.csv", the file opened by the
# interactive check at the end of the notebook -- confirm the downloaded file name.
!gdown 1ip1hr4trQ19S1ecC6Aea09PiTcxUhmlV

In [None]:
from csv import DictReader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import time

In [None]:
# Load the labelled tweets into a DataFrame. The path is relative to the
# notebook's working directory, so the CSV must already be present there.
hatefuldataset = pd.read_csv("labeled_tweet_data.csv")

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes) as a sanity check
hatefuldataset.info()

In [None]:
# Keep only the label column and the tweet text
datasetFilter = hatefuldataset.loc[:, ['class', 'tweet']]

# Labels of the full (unsplit) dataset as a flat NumPy array, in original row order
class_label_value = datasetFilter['class'].values.ravel()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on 'class' keeps the label
# proportions the same in both partitions; the fixed seed makes the split repeatable.
dataTrain, dataTest = train_test_split(
    datasetFilter,
    train_size=0.8,
    test_size=0.2,
    stratify=datasetFilter['class'],
    random_state=30,
)

# (rows, columns) of each partition
print(dataTrain.shape)
print(dataTest.shape)

In [None]:
import re
import nltk
# Fetch the NLTK stopword corpus so that nltk.corpus.stopwords can be read below
nltk.download("stopwords")
from nltk.corpus import stopwords

# Porter stemmer used by the tweet preprocessing step
from nltk.stem.porter import PorterStemmer

In [None]:
# Words dropped during preprocessing: NLTK's English stopwords plus a few
# Twitter-specific tokens ('#ff', 'ff', 'rt').
finalStopwords = list(stopwords.words("english")) + ['#ff', 'ff', 'rt']

# 'not' is kept in the text because negation changes the meaning of a sentence
finalStopwords.remove('not')

In [None]:
def preprocessTweet(tweetInfo):
    ''' Clean raw tweets and return them as a NumPy array of normalised strings.

    For every tweet the function removes mentions, the retweet marker, links
    (including pic.twitter.com image links), HTML numeric entities and hashtags,
    then strips every non-letter character, lowercases the text, drops the words
    listed in the module-level ``finalStopwords`` as well as words of two
    characters or fewer, and stems what is left with a Porter stemmer.

    Parameters
    ----------
    tweetInfo : iterable of str
        Raw tweet texts.

    Returns
    -------
    numpy.ndarray
        One cleaned, space-joined string per input tweet, in input order.
        An empty input yields an empty array.
    '''
    # stemmer cuts off the ends of words (includes derivational affixes).
    stemmer = PorterStemmer()

    # Set membership is O(1); a list would be rescanned for every single word.
    stopwordSet = set(finalStopwords)

    processedArr = []
    for each in tweetInfo:
        # Removing mentions (starts with @)
        text = re.sub(r'@[A-Za-z0-9_]+', ' ', each)

        # Removing the retweet marker. The word boundaries keep words that merely
        # contain "RT" (e.g. "SMART") intact.
        text = re.sub(r'\bRT\b', ' ', text)

        # Removing links. The full URL has to go first: stripping "http(s)" before
        # it would leave "://host/path" behind, and the path would end up as tokens.
        text = re.sub(r'https?://\S+', ' ', text)
        text = re.sub(r'pic\.twitter\.com/\S+', ' ', text)
        # Leftover scheme fragments (e.g. from truncated links)
        text = re.sub(r'https?', ' ', text)

        # Removing HTML numeric entities and hashtags (starts with #). This must run
        # before punctuation is stripped, otherwise '&' and '#' are already gone and
        # the patterns can never match.
        text = re.sub(r'&#[0-9]*', ' ', text)
        text = re.sub(r'#[A-Za-z0-9]+', ' ', text)

        # Removing punctuation, digits and every other non-letter character
        text = re.sub(r'[^a-zA-Z]', ' ', text)

        # Lowercase, split into words, drop stopwords / very short words, then stem
        words = text.lower().split()
        stemmedWords = [
            stemmer.stem(word)
            for word in words
            if word not in stopwordSet and len(word) > 2
        ]
        processedArr.append(' '.join(stemmedWords))

    # Build the array once, after the loop: cheaper than rebuilding it per tweet
    # and well-defined for empty input.
    return np.array(processedArr)

In [None]:
def wordFrequencyDictionary(words):
    ''' Map each distinct word to its relative frequency in ``words``.

    Parameters
    ----------
    words : list of str
        Tokens, duplicates included.

    Returns
    -------
    dict
        ``{word: occurrences / len(words)}``, keyed in order of first appearance.
        An empty input gives an empty dict.
    '''
    # Local import keeps this cell self-contained.
    from collections import Counter

    totalWords = len(words)
    # A single counting pass instead of calling words.count() for every token,
    # which made the original quadratic in the number of tokens.
    return {word: count / totalWords for word, count in Counter(words).items()}

In [None]:
def sortDictionary(freqDictionary):
    ''' Return the dictionary as (frequency, word) pairs, highest frequency first.

    Pairs are compared as tuples, so words sharing a frequency come out in
    reverse alphabetical order.
    '''
    ascendingPairs = sorted(
        (frequency, word) for word, frequency in freqDictionary.items()
    )
    return ascendingPairs[::-1]

In [None]:
def findCommonWord(words, x):
    ''' Return the words of the first ``x`` (frequency, word) pairs in ``words``. '''
    extractedWords = []
    for pair in words:
        # Each pair is (frequency, word); only the word is kept
        extractedWords.append(pair[1])
    return extractedWords[:x]

In [None]:
# Clean every training tweet; the result is a NumPy array of cleaned strings
# in the same row order as dataTrain.
processedArr = preprocessTweet(dataTrain['tweet'].values)

In [None]:
# Carve a validation set (20%) out of the cleaned training tweets, stratified on
# the class label so both parts keep the same label proportions.
train2, valid2, train2_y, valid2_y = train_test_split(
    processedArr,
    dataTrain['class'],
    test_size=0.2,
    stratify=dataTrain['class'],
    random_state=0,
)

print(train2.shape)
print(valid2.shape)

In [None]:
# Training tweets labelled as hate speech (class 0). The labels must come from
# train2_y, which is row-aligned with train2; class_label_value holds the labels
# of the whole unsplit dataset in its original order and does not line up with
# the shuffled training subset.
hatefulTweets = [sentence for sentence, label in zip(train2, train2_y) if label == 0]
hatefulWords = ' '.join(hatefulTweets)
# Relative frequency of every word, then sorted from most to least frequent
hatefulWords = wordFrequencyDictionary(hatefulWords.split())
hatefulDictionary = sortDictionary(hatefulWords)

In [None]:
# Training tweets labelled as offensive (class 1). The labels must come from
# train2_y, which is row-aligned with train2; class_label_value holds the labels
# of the whole unsplit dataset in its original order and does not line up with
# the shuffled training subset.
offensiveTweets = [sentence for sentence, label in zip(train2, train2_y) if label == 1]
offensiveWords = ' '.join(offensiveTweets)
# Relative frequency of every word, then sorted from most to least frequent
offensiveWords = wordFrequencyDictionary(offensiveWords.split())
offensiveDictionary = sortDictionary(offensiveWords)

In [None]:
# Training tweets labelled as neither hateful nor offensive (class 2). The labels
# must come from train2_y, which is row-aligned with train2; class_label_value
# holds the labels of the whole unsplit dataset in its original order and does
# not line up with the shuffled training subset.
neutralTweets = [sentence for sentence, label in zip(train2, train2_y) if label == 2]
neutralWords = ' '.join(neutralTweets)
# Relative frequency of every word, then sorted from most to least frequent
neutralWords = wordFrequencyDictionary(neutralWords.split())
neutralDictionary = sortDictionary(neutralWords)

In [None]:
# Vocabulary = the most frequent words of each class (up to 4000 hateful,
# 2000 offensive, 2000 neutral), flattened into one array and de-duplicated.
common = np.unique(
    np.hstack([
        findCommonWord(hatefulDictionary, 4000),
        findCommonWord(offensiveDictionary, 2000),
        findCommonWord(neutralDictionary, 2000),
    ])
)

In [None]:
# Map every vocabulary word to its position in `common`
commonDictionary = {word: index for index, word in enumerate(common)}

## Tokenization

In [None]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

newTokenizer = TweetTokenizer()

# Bag-of-words counts over the fixed vocabulary built above. Because a custom
# tokenizer is supplied, the default token_pattern would be ignored and
# scikit-learn would emit a UserWarning about it; passing None states the
# intent explicitly and keeps the output clean.
newVectorizer = CountVectorizer(
    analyzer="word",
    vocabulary=commonDictionary,
    tokenizer=newTokenizer.tokenize,
    token_pattern=None,
)

# Dense count matrices: one row per tweet, one column per vocabulary word
train3 = newVectorizer.fit_transform(train2).toarray()
valid3 = newVectorizer.transform(valid2).toarray()

print(train3.shape)
print(valid3.shape)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Time training plus prediction on the validation set
startTime = time.time()

# fit() returns the estimator itself, so modelLR is the trained model
modelLR = LogisticRegression(random_state=0, max_iter=400).fit(train3, train2_y.ravel())
prediction = modelLR.predict(valid3)

endTime = time.time()
print(f'Time passed: {endTime - startTime}\n')

# Per-class precision / recall / F1 on the validation set
print(classification_report(valid2_y, prediction))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# plot_confusion_matrix was removed in scikit-learn 1.2 and is not used in this
# cell; importing it unconditionally makes the cell fail on current releases.
# The name is still bound (to None when unavailable) for backward compatibility.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Rows are the true classes, columns the predicted classes
confusionMatrix = confusion_matrix(valid2_y, prediction)
disp = ConfusionMatrixDisplay(confusion_matrix = confusionMatrix)
disp.plot()

plt.title('Logistic Regression')
plt.show()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Time training plus prediction on the validation set
startTime = time.time()

desTreeModel = DecisionTreeClassifier(random_state=0)
# fit() returns the same estimator, so both names refer to the trained tree
desTreeModelFinal = desTreeModel.fit(train3, train2_y.ravel())
prediction2 = desTreeModelFinal.predict(valid3)

endTime = time.time()
print(f'Time passed: {endTime - startTime}\n')

# Per-class precision / recall / F1 on the validation set
print(classification_report(valid2_y, prediction2))

In [None]:
# Rows are the true classes, columns the predicted classes
confusionMatrix2 = confusion_matrix(valid2_y, prediction2)

# plot() draws the matrix and returns the display object itself
disp = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix2).plot()

plt.title('Decision Tree')
plt.show()

In [None]:
# Spot check: run two single tweets from the full dataset through the same
# clean -> vectorize -> predict pipeline and compare with their stored label.

# Row at position 180
print('First Example')
testList = preprocessTweet(datasetFilter['tweet'][180:181].values)
X_test = newVectorizer.transform(testList).toarray()

predictExample = modelLR.predict(X_test)
print(f'Predicted Label: {predictExample[0]}')

actual = datasetFilter['class'][180:181].values
print(f'Actual Label: {actual[0]}')


# Row at position 10
print('\nSecond Example')
testList2 = preprocessTweet(datasetFilter['tweet'][10:11].values)
X_test2 = newVectorizer.transform(testList2).toarray()

predictExample = modelLR.predict(X_test2)
print(f'Predicted Label: {predictExample[0]}')

actual = datasetFilter['class'][10:11].values
print(f'Actual Label: {actual[0]}')

In [None]:
import string


def loadBadWords(csvPath='bad-words.csv', column='bad_word_column'):
    ''' Read the profanity list once and return it as a set of lowercase words.

    Blank entries are skipped. A missing column raises KeyError.
    '''
    badWords = set()
    with open(csvPath, 'r', newline='') as wordFile:
        for row in DictReader(wordFile):
            entry = row[column]
            if entry is None:
                continue
            entry = str(entry).strip().lower()
            if entry:
                badWords.add(entry)
    return badWords


def containsProfanity(sentence, badWords):
    ''' Return True if ANY whitespace-separated word of `sentence` is in `badWords`.

    Matching ignores case, and also tries the word with surrounding punctuation
    removed, so "Word!" still matches "word".
    '''
    for token in sentence.split():
        lowered = token.lower()
        if lowered in badWords or lowered.strip(string.punctuation) in badWords:
            return True
    return False


# Result text keyed by (predicted class, profane word present).
# Classes: 0 = hate speech, 1 = offensive, 2 = neither.
RESULT_MESSAGES = {
    (2, True): 'The sentence does not contain any hate speech or offensive word but, it contains profane word\n',
    (2, False): 'The sentence does not contain any hate speech, offensive word or profane word\n',
    (1, True): 'The sentence contains offensive word and profane word\n',
    (1, False): 'The sentence contains offensive word but, does not contains profane word\n',
    (0, True): 'The sentence contains hate speech and profane word\n',
    (0, False): 'The sentence contains hate speech but, does not contains a profane word\n',
}

# Load the list once instead of re-reading the CSV for every word of every input
badWords = loadBadWords()

while True:
    userInput = input("Enter the sentence you want to check: (If you want to quit, enter \'quit\')")
    if userInput == 'quit':
        print()
        break

    # Nothing to classify: print an empty line, as before, and ask again
    if not userInput.split():
        print()
        continue

    # Same pipeline as training: clean -> vectorize -> predict
    userInputFinal = preprocessTweet([userInput])
    user_test = newVectorizer.transform(userInputFinal).toarray()
    user_predict = modelLR.predict(user_test)

    # The verdict now reflects every word of the sentence; previously each word
    # overwrote the message, so only the last word decided the outcome.
    hasProfanity = containsProfanity(userInput, badWords)

    # An unexpected class label prints an empty line, matching the old behaviour
    print(RESULT_MESSAGES.get((user_predict[0], hasProfanity), ''))