In [12]:
import os
import pandas as pd
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages this notebook asks for, in a fixed order.
# 'omw-1.4' is the additional data required by wordnet.
for _resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(_resource)
from langdetect import detect

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Build the English stopword set once; rebuilding it inside the function
# repeats the same work for every row that is cleaned.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is split on whitespace, every word found in the module-level
    `stop_words` set is dropped, and the remaining words are re-joined with
    single spaces. Membership is an exact string match, so casing matters.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; empty if every word was a stopword.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)
def tokenize_text(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return them."""
    tokens = word_tokenize(text)
    return tokens

In [14]:
def _clean_text(text):
    """Normalize one document string and return its list of tokens."""
    # Lowercase first, then expand contractions.
    text = contractions.fix(text.lower())
    # Strip URLs, then HTML tags, then runs of digits.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    # Trim the ends, then drop anything that is neither a word character
    # nor whitespace (i.e. punctuation).
    text = re.sub(r'[^\w\s]', '', text.strip())
    # Filter stopwords on the string, then tokenize what is left.
    return tokenize_text(remove_stopwords(text))


def preprocess(df):
    """Clean and tokenize the 'combined_text' column of `df`.

    Each document is lowercased, has contractions expanded, has URLs, HTML
    tags, digits and punctuation removed, has stopwords filtered out, and is
    finally tokenized.

    Missing values are replaced by an empty string before the cast to `str`.
    Casting them directly would produce the literal text "nan", which would
    then survive cleaning as a bogus token.

    Args:
        df: DataFrame with a 'combined_text' column. The column is
            overwritten in place.

    Returns:
        The same `df` object, with 'combined_text' holding one token list
        per row.
    """
    df['combined_text'] = (
        df['combined_text']
        .fillna('')
        .astype(str)
        .apply(_clean_text)
    )
    return df

# SVM

In [15]:
import os
import pandas as pd
import re
import contractions
from langdetect import detect

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [17]:
# Build the path with os.path.join so the separator matches the host OS;
# the literal backslash form only resolves on Windows.
train = pd.read_csv(os.path.join('..', '..', 'Dataset_Cleaned', 'ISOT', 'clean_train_isot.csv'))

In [18]:
# Clean and tokenize the training text; preprocess overwrites 'combined_text' on the frame it is given.
train = preprocess(df=train)

In [19]:
# Separate the model input column from the target column.
X_train, y_train = train['combined_text'], train['label']

In [20]:
# Preview the first five rows of the training input.
X_train.head(n=5)

0    [conservative, terrorist, given, insanely, lig...
1    [clinton, ad, slam, trump, disgusting, insult,...
2    [hardliner, protest, french, labor, reform, ma...
3    [nba, kowtow, racist, order, player, stand, an...
4    [ben, carson, praise, time, trump, compared, c...
Name: combined_text, dtype: object

In [21]:
# Re-join each token list into one space-separated string, the same way
# X_test is built. Casting the lists with astype(str) would instead store
# their repr (brackets, quotes, commas), leaving train and test formatted
# differently. Reading from `train` keeps this cell safe to re-run.
X_train = train['combined_text'].apply(' '.join)

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

# Two-stage model: TF-IDF features feeding a support vector classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC()),
])

# Search space. It holds a single candidate, so the "search" is really a
# cross-validated fit of one fixed configuration.
params = [
    {
        'clf': [SVC()],
        'clf__C': [10],            # regularization strength
        'clf__kernel': ['rbf'],    # kernel type
        'clf__gamma': [0.1],       # kernel coefficient
    }
]

# 10-fold cross-validation that preserves the class balance in each fold.
stratified_kfold = StratifiedKFold(n_splits=10)

# refit=True asks GridSearchCV to refit the best candidate after the search.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=stratified_kfold,
    n_jobs=-1,
    refit=True,
    verbose=2,
)

# Run the cross-validated fit on the training data.
grid_search.fit(X_train, y_train)

# Report the mean cross-validated score and the winning parameter set.
print("Best Score: ", grid_search.best_score_)
print("Best Parameters: ", grid_search.best_params_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best Score:  0.9931331519777908
Best Parameters:  {'clf': SVC(), 'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [24]:
# Keep the best pipeline found by the search (GridSearchCV was created with refit=True).
clf = grid_search.best_estimator_

In [25]:
# Training performance
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16933
           1       1.00      1.00      1.00     13940

    accuracy                           1.00     30873
   macro avg       1.00      1.00      1.00     30873
weighted avg       1.00      1.00      1.00     30873



# Test set prediction and evaluation

In [26]:
# Build the path with os.path.join so the separator matches the host OS;
# the literal backslash form only resolves on Windows.
test = pd.read_csv(os.path.join('..', '..', 'Dataset_Cleaned', 'ISOT', 'clean_test_isot.csv'))

In [27]:
# Apply the same cleaning and tokenization to the test text as was applied to the training text.
test = preprocess(df=test)

In [28]:
# preprocess has already turned 'combined_text' into token lists, so this
# fill only affects other columns that still contain missing values.
test = test.fillna(' ')

# Re-join each token list into a space-separated string for the vectorizer.
X_test = test['combined_text'].apply(' '.join)
y_test = test['label']

# Predict with the fitted pipeline. (A stray bare `test` expression that
# sat mid-cell and displayed nothing has been removed.)
y_test_pred = clf.predict(X_test)

In [29]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4258
           1       0.99      0.99      0.99      3461

    accuracy                           0.99      7719
   macro avg       0.99      0.99      0.99      7719
weighted avg       0.99      0.99      0.99      7719



In [31]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test set.
print(confusion_matrix(y_true=y_test, y_pred=y_test_pred))

[[4235   23]
 [  45 3416]]
