In [47]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deeptweets/sample_submission.csv
/kaggle/input/deeptweets/training.csv
/kaggle/input/deeptweets/train.csv
/kaggle/input/deeptweets/test.csv


In [48]:
import pandas as pd

train =  pd.read_csv("/kaggle/input/deeptweets/train.csv")
test =  pd.read_csv("/kaggle/input/deeptweets/test.csv")

In [49]:
train.head()

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


In [50]:
test.head()

Unnamed: 0,TweetId,TweetText
0,306486520121012224,'28. The home side threaten again through Maso...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....
2,289531046037438464,'@Sochi2014 construction along the shores of t...
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...


In [51]:
train["Label"].value_counts()

Sports      3325
Politics    3200
Name: Label, dtype: int64

In [52]:
 # deleting twitId from train data
train=train.drop(['TweetId'],axis=1)
train.head()

Unnamed: 0,Label,TweetText
0,Politics,'#SecKerry: The value of the @StateDept and @U...
1,Politics,'@rraina1481 I fear so'
2,Sports,'Watch video highlights of the #wwc13 final be...
3,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,Sports,'@cricketfox Always a good thing. Thanks for t...


* ***Text preprocessing***

In [53]:
import re
import spacy
from sklearn.model_selection import train_test_split
import nltk
nltk.download('wordnet', quiet=True)
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from string import punctuation
import collections
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import en_core_web_sm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

In [54]:
# removing the hashtags, mentions and unwanted characters.
def clean_text(db, text):
    db[text] = db[text].str.lower()
    db[text] = db[text].apply(lambda elem: re.sub(r"(#[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|^rt|[^A-Za-z0-9\s]", "", elem)) 
    db = db[db[text].str.strip() != ""]  # Remove empty tweets after cleaning    
    return db

In [55]:
train = clean_text(train, 'TweetText')
train.head()

Unnamed: 0,Label,TweetText
0,Politics,the value of the and is measured not in dol...
1,Politics,i fear so
2,Sports,watch video highlights of the final between a...
3,Sports,rt at nitro circus at
4,Sports,always a good thing thanks for the feedback


In [56]:
# this section is for :
#  tokenize the input text into individual words
# removeing any stopwords from it(a,the...)
#  Lemmatization: converts words 
# to their base or root form. For example, 'running' becomes 'run'
# converting it to lowercase

# Load the spaCy English language model
nlp = en_core_web_sm.load()

# Initialize the stopwords
stop = set(stopwords.words('english'))

# function for text processing
def cleanText(text):
    final_text = []
    w_tokenizer = WhitespaceTokenizer()
    doc = nlp(text)
    for token in doc:
        if token.text.lower() not in stop and token.text not in string.punctuation:
            final_text.append(token.lemma_.lower())
    return " ".join(final_text)

In [57]:
train['TweetText'] = train['TweetText'].apply(cleanText)
train.head(10)

Unnamed: 0,Label,TweetText
0,Politics,value measure dollar term deep american ...
1,Politics,fear
2,Sports,watch video highlight final australia west i...
3,Sports,rt nitro circus
4,Sports,always good thing thank feedback
5,Politics,dr rajan fiscal consolidation create space mon...
6,Politics,fact 800000 defense employee force take unpaid...
7,Sports,1st test 39 0 run 1 wkt wade 0 clarke 24 101...
8,Sports,africas top team try take step close week ch...
9,Sports,beat tweet zebras stop play unusual reason c...


In [58]:
# Function to convert text to a list of words
def extract_words(text):
    return text.split()

# Add the new column
train['TweetsList'] = train['TweetText'].apply(extract_words)

# Display the DataFrame with the new column
train.head()

Unnamed: 0,Label,TweetText,TweetsList
0,Politics,value measure dollar term deep american ...,"[value, measure, dollar, term, deep, american,..."
1,Politics,fear,[fear]
2,Sports,watch video highlight final australia west i...,"[watch, video, highlight, final, australia, we..."
3,Sports,rt nitro circus,"[rt, nitro, circus]"
4,Sports,always good thing thank feedback,"[always, good, thing, thank, feedback]"


* ***Feature Extraction***

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Data Preparation
X_train = train['TweetText']
y_train = train['Label']


# Vectorization
vectorizer = TfidfVectorizer()

X_train_vectorized = vectorizer.fit_transform(X_train)

* ***Model Training and Hyper-parameter tuning***

In [60]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_train_vectorized, y_train, test_size=0.3, random_state=42)

In [61]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV


# Define the parameter grid for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 1, 'scale']
}

# Initialize and train  classifiers 

svm_classifier = SVC()


#  grid search for SVM
svm_grid_search = GridSearchCV(svm_classifier, svm_param_grid, cv=5, n_jobs=-1)
svm_grid_search.fit(X_train, y_train)

# Get the best SVM classifier
best_svm_classifier = svm_grid_search.best_estimator_
print(' best hyperparameters for svm',svm_grid_search.best_params_)



 best hyperparameters for svm {'C': 10, 'gamma': 1, 'kernel': 'rbf'}


In [62]:
# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_classifier = RandomForestClassifier()

# Perform grid search for Random Forest
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Get the best Random Forest classifier
best_rf_classifier = rf_grid_search.best_estimator_

In [63]:
# Initialize Logistic Regression classifier
logreg_classifier = LogisticRegression()

logreg_classifier.fit(X_train, y_train)

In [64]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

* ***Predicting***

In [65]:
#  making predictions on the testing data
svm_predictions = best_svm_classifier.predict(X_test)
rf_predictions = best_rf_classifier.predict(X_test)
logreg_predictions = logreg_classifier.predict(X_test)
nb_predictions = nb_classifier.predict(X_test)

In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Calculate evaluation metrics
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted')
svm_recall = recall_score(y_test, svm_predictions, average='weighted')
svm_f1_score = f1_score(y_test, svm_predictions, average='weighted')
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')
rf_f1_score = f1_score(y_test, rf_predictions, average='weighted')
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)

logreg_accuracy = accuracy_score(y_test, logreg_predictions)
logreg_precision = precision_score(y_test, logreg_predictions, average='weighted')
logreg_recall = recall_score(y_test, logreg_predictions, average='weighted')
logreg_f1_score = f1_score(y_test, logreg_predictions, average='weighted')
logreg_conf_matrix = confusion_matrix(y_test, logreg_predictions)

nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')
nb_recall = recall_score(y_test, nb_predictions, average='weighted')
nb_f1_score = f1_score(y_test, nb_predictions, average='weighted')
nb_conf_matrix = confusion_matrix(y_test, nb_predictions)


# dictionary to hold the results
results = {
    'Classifier': ['SVM', 'Random Forest','Logistic Regression','Naive Bayes'],
    'Accuracy': [svm_accuracy, rf_accuracy,logreg_accuracy,nb_accuracy],
    'Precision': [svm_precision, rf_precision,logreg_precision,nb_precision],
    'Recall': [svm_recall, rf_recall,logreg_recall,nb_recall],
    'F1-score': [svm_f1_score, rf_f1_score,logreg_f1_score,nb_f1_score],
    'Confusion Matrix': [svm_conf_matrix, rf_conf_matrix,logreg_conf_matrix,nb_conf_matrix]
}

# Convert the dictionary to a DataFrame
results = pd.DataFrame(results)

# Print the DataFrame
print(results)

            Classifier  Accuracy  Precision    Recall  F1-score  \
0                  SVM  0.925641   0.925633  0.925641  0.925635   
1        Random Forest  0.883590   0.884646  0.883590  0.883301   
2  Logistic Regression  0.918974   0.919041  0.918974  0.918929   
3          Naive Bayes  0.926154   0.926207  0.926154  0.926118   

          Confusion Matrix  
0   [[851, 74], [71, 954]]  
1  [[782, 143], [84, 941]]  
2   [[837, 88], [70, 955]]  
3   [[845, 80], [64, 961]]  


In [68]:
submission=test.copy(deep=True)
submission=submission.drop(["TweetText"],axis=1)

In [70]:
submission.to_csv('submission.csv',index=False)

***In my effort to enhance the existing solution, I propose that for enhancing the model's performance, we allocate additional time to refine the data cleaning process while also considering the reduction of features through techniques such as PCA or Univariate feature selection.***

***Authored by MARYAM KHALLALA
Student in IT Engineering***