In [15]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [16]:
# Load the data
df = pd.read_csv("train.csv")
df = df.dropna()
X = df["text"]
Y = df["target"]

In [17]:
# Preprocessing
stop_words = set(stopwords.words("english"))
porter = nltk.PorterStemmer()

def preprocess_text(text):
    # tokenize the text
    tokenized = word_tokenize(text.lower())
    # remove stop words
    filtered = [token for token in tokenized if token not in stop_words]
    # perform stemming
    stemmed = [porter.stem(token) for token in filtered]
    return " ".join(stemmed)

X = X.apply(preprocess_text)

In [18]:
# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [19]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# Train the model
svm = SVC(kernel='linear')
svm.fit(X_train, Y_train)

SVC(kernel='linear')

In [20]:
# Evaluate the model
y_pred = svm.predict(X_test)
print("The accuracy score for the SVM model is: ", accuracy_score(Y_test, y_pred))
print("The F1 score for the SVM model is: ", f1_score(Y_test, y_pred))

The accuracy score for the SVM model is:  0.8149606299212598
The F1 score for the SVM model is:  0.763819095477387


In [21]:
# now apply the preprocesssing on the test data set
#Loading the Tst data
df_test = pd.read_csv("D:\\SensViz\\Machine Learning Task 3\\test.csv")

df_test_text = df_test['text'].apply(preprocess_text)

#vectorized the data
df_test_text = vectorizer.transform(df_test_text)
svm_pred = svm.predict(df_test_text)
print(svm_pred)

[1 1 1 ... 1 1 0]


In [22]:
# create the submission file 
submission = pd.DataFrame({'id': df_test['id'], 'target': svm_pred})
submission.to_csv('submission.csv', index=False)

In [23]:
#read the submission file
sub_file = pd.read_csv("submission.csv")
sub_file.head(50)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0
