In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer #for noise removal 
from sklearn.feature_extraction.text import TfidfVectorizer
import re #Regex Library
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
lemmatizerObject = WordNetLemmatizer()
labels=[]
tweets=[]
def remove_noise(input_text):
    noise_list = ["a", "about", "after", "all", "also", "an", "another", "any", "and", "are", "as", "and", "at", "be", "because", "been", "before", "being", "between", "but", "both", "by", "came", "can", "come", "could ", "did", "do", "each", "even", "for", "from", "further", "furthermore", "get", "got", "has", "had", "he", "have", "her", "here", "him", "himself", "his", "how", "hi", "however","i", "if", "in", "into", "is", "it", "its", "indeed", "just", "like", "made", "many", "me", "might", "more", "moreover", "most", "much", "must", "my never", "not", "now of", "on", "only", "other", "our", "out", "or", "over", "said", "same", "see", "should", "since", "she", "some", "still", "such", "take", "than", "that", "the", "their", "them", "then", "there", "these", "therefore", "they", "this", "those", "through", "to", "too", "thus", "under", "up", "was", "way", "we", "well", "were", "what", "when", "where", "which", "while", "who", "will", "with", "would", "your", "null"]
    words = input_text.split() # Split words by space
    noise_free_words = [word for word in words if word.lower() not in noise_list] #Get a list of non-noise words
    noise_free_text = " ".join(noise_free_words) #Get a string of non-noise words
    return noise_free_text

In [3]:
def remove_regex(input_text):
    #split tweet by space
    words = input_text.split() 
    regex_free_text=""
    #check if word is alpha(contain letters only) , then add it to regex_free_text
    for word in words:
      if word.isalpha():
        #Lemmatization, on the other hand, is an organized & step by step procedure of obtaining
        #the root form of the word, it makes use of vocabulary (dictionary importance of words) 
        #and morphological analysis (word structure and grammar relations).
        #reduces the inflected words properly ensuring that the root word belongs to the language
        #pos="V"-->to give a root for each word !
        regex_free_text += lemmatizerObject.lemmatize(word,pos="v")
        regex_free_text +=" "
    return regex_free_text
        

In [4]:
def readFile(filename):
    my_file = open(filename, encoding="utf-8")
    #return value of csv file is an iterator 
    read = csv.reader(my_file,delimiter='\t')
    #splitting = read.split('\t')
    flag=0;
    #flag ---> used to skip the header of the file
    #column one for tweets , column 2 for our ouput(NOT or OFF)
    for row in read:
      if flag ==0:
        flag=1
        continue;
      tweets.append(row[1])
      labels.append(row[2])


In [5]:
def extract_features(ourTweets,ourTestTweets):   
   vectorizer = TfidfVectorizer()
   # return value ---> position of the word , index of tweet , tfidf value of the word.
   X = vectorizer.fit_transform(ourTweets)
   Y = vectorizer.transform(ourTestTweets)
   return X,Y;

In [6]:
def SVMClassifier(featuresTrain,labelsTrain,featuresTest):
    #bn3ml training 3la train data (bn build our method 3aleha )
    #f b3ml object mn classifier bt3i w b3den fit de bt3ml train lal data bt3ty
    #tol---> nesbt el error el masbo7 beha el lw wsl 3ndha aw 2al yw2f w my7rksh el separator 
    #random_state is the seed used by the random number generator
    #linear SVC da shbh precepton 
    clf = LinearSVC(random_state=0, tol=1e-5)
    clf.fit(featuresTrain, labelsTrain)
    #b predict b2a 3la test data bt3ty 3shn agib accuracy bt3t classifier da
    X=clf.predict(featuresTest)
    return X;


In [7]:
def RandomForest_Classifier(featuresTrain,labelsTrain,featuresTest):
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(featuresTrain, labelsTrain)
        
    return clf.predict(featuresTest)

In [8]:
def LogisticalRegression_Classifier(featuresTrain,labelsTrain,featuresTest):
    clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
    clf.fit(featuresTrain, labelsTrain)
    
    return clf.predict(featuresTest)

In [9]:
def NaiveBayesClassifier(featuresTrain,labelsTrain,featuresTest):
    clf = MultinomialNB()
    clf.fit(featuresTrain, labelsTrain)
    
    return clf.predict(featuresTest)

In [10]:
def KNNClassifier(featuresTrain,labelsTrain,featuresTest):
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(featuresTrain, labelsTrain)
    
    return clf.predict(featuresTest)

In [11]:
def getAccuracy(outputLabels,actualLabels):
      return accuracy_score(outputLabels, actualLabels)
def calculateConfusionMatrix(outputLabels,actualLabels):
    CM = confusion_matrix(actualLabels, outputLabels)
    print(CM)
readFile("offenseval-training-v1.tsv")
#loop for each tweet remove regex & noise
for tweet in range(0,len(tweets)):
    tweets[tweet]=remove_regex(tweets[tweet])
    tweets[tweet]=remove_noise(tweets[tweet])
#we have to divide our dataset into 2 parts (training data and test data)
lenofLabel=(int)(len(labels)/2)
lenofTweets=(int)(len(tweets)/2)
train_labels=labels[:lenofLabel]
test_labels=labels[lenofLabel:]
train_tweets=tweets[:lenofLabel]
test_tweets=tweets[lenofLabel:]
#to extract Features
features_train,features_test=extract_features(train_tweets,test_tweets)

SVMpredictLabels = SVMClassifier(features_train,train_labels,features_test)
RFpredictLabels = RandomForest_Classifier(features_train,train_labels,features_test)
LRpredictLabels = LogisticalRegression_Classifier(features_train,train_labels,features_test)
NBpredictLabels=NaiveBayesClassifier(features_train,train_labels,features_test)
KNNpredictLabels=KNNClassifier(features_train,train_labels,features_test)
#calculate accuracy of predicted labels (from our algorithm ) & the actual labels
SVMaccuracy=getAccuracy(SVMpredictLabels,test_labels)
RFaccuracy=getAccuracy(RFpredictLabels,test_labels)
LRaccuracy=getAccuracy(LRpredictLabels,test_labels)
NBaccuracy=getAccuracy(NBpredictLabels,test_labels)
KNNaccuracy=getAccuracy(KNNpredictLabels,test_labels)

print("accuracy of svm calssifier:")
print(SVMaccuracy)
print("\n")

print("accuracy of Random Forrest calssifier:")
print(RFaccuracy)
print("\n")

print("accuracy of Logistic Regression calssifier:")
print(LRaccuracy)
print("\n")

print("accuracy of Naive Bayes calssifier:")
print(NBaccuracy)
print("\n")

print("accuracy of KNN calssifier:")
print(KNNaccuracy)
print("\n")

print("\n")
print("Confusion Matrix of SVM calssifier:\n")
calculateConfusionMatrix(SVMpredictLabels,test_labels)

print("\n")
print("Confusion Matrix of Random Forrest calssifier:\n")
calculateConfusionMatrix(RFpredictLabels,test_labels)

print("\n")
print("Confusion Matrix of Logistical Regression calssifier:")
calculateConfusionMatrix(LRpredictLabels,test_labels)

print("\n")
print("Confusion Matrix of Naive Bayes calssifier:\n")
calculateConfusionMatrix(NBpredictLabels,test_labels)

print("\n")
print("Confusion Matrix of KNN calssifier:\n")
calculateConfusionMatrix(KNNpredictLabels,test_labels)

accuracy of svm calssifier:
0.7253776435045317


accuracy of Random Forrest calssifier:
0.7398791540785499


accuracy of Logistic Regression calssifier:
0.738821752265861


accuracy of Naive Bayes calssifier:
0.7078549848942598


accuracy of KNN calssifier:
0.682477341389728




Confusion Matrix of SVM calssifier:

[[3804  618]
 [1200  998]]


Confusion Matrix of Random Forrest calssifier:

[[4134  288]
 [1434  764]]


Confusion Matrix of Logistical Regression calssifier:
[[4112  310]
 [1419  779]]


Confusion Matrix of Naive Bayes calssifier:

[[4363   59]
 [1875  323]]


Confusion Matrix of KNN calssifier:

[[4359   63]
 [2039  159]]
