# READING DATA

In [1]:
#Imports
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
#read trainning dataset
dataset = "E:\\college\\level.4\\first term\\ST3\\Project\\offensiveTrainningDataset.tsv"
trainning_data = pd.read_csv(dataset, sep='\t', header=0)


In [3]:
#labeling dataset
tweet = trainning_data[["tweet"]]
subtask_a = trainning_data[["subtask_a"]]
subtask_b = trainning_data.query("subtask_a == 'OFF'")[["subtask_b"]]
subtask_c = trainning_data.query("subtask_b == 'TIN'")[["subtask_c"]]

In [4]:
print(len(tweet))
print(len(subtask_a))
print(len(subtask_b))
print(len(subtask_c))

13240
13240
4400
3876


In [5]:
import copy
preprocessdTweets = copy.deepcopy(tweet)

# preprocessing

In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

def removePatterns(tweet):    
    patterns = ['URL', '@USER', '\'ve', 'n\'t', '\'s', '\'m']
    for pattern in patterns:
        tweet = tweet.replace(pattern, '')
    return re.sub(r'[^a-zA-Z]', ' ', tweet)

def removeEmoji(tweet):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',tweet)
    
def tweetTokenize(tweet):
    return word_tokenize(tweet.lower())

def remove_stop_words(tokens):
    stop_words = set(stopwords.words('english'))
    processed_tokens = []
    for token in tokens:  
        if token not in stop_words:  
            processed_tokens.append(token)  
    return processed_tokens

def stem_and_lem(tokens):
    lancaster_stemmer = LancasterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for token in tokens:
        token = wordnet_lemmatizer.lemmatize(token)
        token = lancaster_stemmer.stem(token)
        if len(token) > 1:
            clean_tokens.append(token)
    return clean_tokens

##def lemmatizeTokens(tokens):
##    lemmatizer = WordNetLemmatizer() 
##    processed_tokens = []
##    for token in tokens:
##        token = lemmatizer.lemmatize(token)
##        if len(token)>1:
##            processed_tokens.append(token)
        
##def stemmingTokens(tokens):
##    processed_tokens = []
##    ps = PorterStemmer()
##    for token in tokens:
##        token = ps.stem(token)
##        if len(token)>1:
##            processed_tokens.append(token)
        

In [7]:
preprocessdTweets['tweet'] = tweet['tweet'].apply(removePatterns)

preprocessdTweets['tokens'] = preprocessdTweets['tweet'].apply(removeEmoji)

preprocessdTweets['tokens'] = preprocessdTweets['tweet'].apply(tweetTokenize)

preprocessdTweets['tokens'] = preprocessdTweets['tokens'].apply(remove_stop_words)

preprocessdTweets['tokens'] = preprocessdTweets['tokens'].apply(stem_and_lem)

##tqdm.pandas(desc="Stemming")
##preprocessdTweets['tokens'] = preprocessdTweets['tokens'].progress_apply(stemmingTokens)
 
text_vector = preprocessdTweets['tokens'].tolist()

In [8]:

tqdm.pandas(desc="Stemming and Lemmatizing")
preprocessdTweets['tokens'] = preprocessdTweets['tokens'].apply(stem_and_lem)

text_vector = preprocessdTweets['tokens'].tolist()

  from pandas import Panel


In [10]:
print("this is an example before preprocessing")
print(tweet[:1])

print("\n this is an example after preprocessing")
print(preprocessdTweets[:1])


this is an example before preprocessing
                                               tweet
0  @USER She should ask a few native Americans wh...

 this is an example after preprocessing
                                               tweet               tokens
0   She should ask a few native Americans what th...  [ask, nat, am, tak]


# Vectorization

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfid(text_vector):
    vectorizer = TfidfVectorizer()
    untokenized_data =[' '.join(tweet) for tweet in tqdm(text_vector, "Vectorizing...")]
    vectorizer = vectorizer.fit(untokenized_data)
    vectors = vectorizer.transform(untokenized_data).toarray()
    return vectors

def get_vectors(vectors, labels, keyword):
    if len(vectors) != len(labels):
        print("Unmatching sizes!")
        return
    result = list()
    for vector, label in zip(vectors, labels):
        if label == keyword:
            result.append(vector)
    return result



In [12]:
vectors_a = tfid(text_vector) # Numerical Vectors A
labels_a = subtask_a['subtask_a'].values.tolist() # Subtask A Labels

vectors_b = get_vectors(vectors_a, labels_a, "OFF") # Numerical Vectors B
labels_b = subtask_b['subtask_b'].values.tolist() # Subtask B Labels

vectors_c = get_vectors(vectors_b, labels_b, "TIN") # Numerical Vectors C
labels_c = subtask_c['subtask_c'].values.tolist() # Subtask C Labels

Vectorizing...: 100%|██████████| 13240/13240 [00:00<00:00, 156862.40it/s]


# Classifing and Modeling

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

def classify(vectors, labels):
    # Random Splitting With Ratio 3 : 1
    train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors, labels, test_size=0.333)

    # Initialize Model
    classifier = None
    #using LR model
    classifier = LogisticRegression(multi_class='auto', solver='newton-cg',)
    classifier = GridSearchCV(classifier, {"C":np.logspace(-3,3,7), "penalty":["l2"]}, cv=3, n_jobs=4)
    classifier.fit(train_vectors, train_labels)
    classifier = classifier.best_estimator_

    accuracy = accuracy_score(train_labels, classifier.predict(train_vectors))
    print("Training Accuracy:", accuracy)
    test_predictions = classifier.predict(test_vectors)
    accuracy = accuracy_score(test_labels, test_predictions)
    print("Test Accuracy:", accuracy)

In [14]:
#Bulding Models A, B, C using Logestic Regrision

In [15]:
print("\nBuilding Model Subtask A...")
classify(vectors_a[:13240], labels_a[:13240])


Building Model Subtask A...
Training Accuracy: 0.8067036575699241
Test Accuracy: 0.7530052166024042


In [16]:
print("\nBuilding Model Subtask B...")
classify(vectors_b[:], labels_b[:])


Building Model Subtask B...
Training Accuracy: 0.8803680981595092
Test Accuracy: 0.8819918144611187


In [17]:
print("\nBuilding Model Subtask C...")
classify(vectors_c[:], labels_c[:])


Building Model Subtask C...
Training Accuracy: 0.797678916827853
Test Accuracy: 0.675445391169636
