In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,confusion_matrix

import warnings
warnings.filterwarnings('ignore')

### Load the dataset

- Load the training data and explore the statistical properties of the dataset.

In [2]:
# Code starts here
# read the training set from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# English stopwords, held in a set so each membership check is a hash lookup
stop = set(stopwords.words('english'))

# compiled once instead of re-parsing the pattern for every title
_NON_LETTERS = re.compile("[^a-zA-Z]")


def clean_title(title):
    """Normalise one headline for vectorisation.

    Replaces every character outside a-z/A-Z with a space, lowercases the
    text, splits it on whitespace, drops English stopwords and joins the
    remaining tokens with single spaces.

    Parameters
    ----------
    title : str
        Raw headline. Anything that is not a string (for example the NaN
        pandas uses for a missing value) is treated as an empty headline
        instead of raising inside the regex call.

    Returns
    -------
    str
        The cleaned, space-separated tokens; may be an empty string.
    """
    if not isinstance(title, str):
        return ''
    tokens = _NON_LETTERS.sub(" ", title).lower().split()
    return ' '.join(token for token in tokens if token not in stop)


# one pass over the column instead of four separate .apply calls
data['TITLE'] = data['TITLE'].apply(clean_title)

In [7]:
# hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data["TITLE"],
    data["CATEGORY"],
    test_size=0.3,
    random_state=42,
)

# count features: vocabulary is learned on the training split only,
# the held-out split is merely projected onto it
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# tf-idf features over 1- to 3-grams, fitted the same way
tv = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = tv.fit_transform(X_train)
X_test_tfidf = tv.transform(X_test)

# shape check: (train count features, test tf-idf features) against their labels,
# then the raw text splits against the same labels
print(f"{X_train_cv.shape} {y_train.shape}")
print(f"{X_test_tfidf.shape} {y_test.shape}")
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(236554, 40795) (236554,)
(101381, 1501240) (101381,)
(236554,) (236554,)
(101381,) (101381,)


In [8]:
# one Multinomial Naive Bayes model per feature representation
nb_1 = MultinomialNB()
nb_2 = MultinomialNB()

# fit on count vectorizer training data
nb_1.fit(X_train_cv, y_train)

# fit on tfidf vectorizer training data
nb_2.fit(X_train_tfidf, y_train)

# accuracy_score is documented as (y_true, y_pred): the ground truth goes first.
# Accuracy is symmetric, so the value is the same as with the arguments swapped,
# but the conventional order keeps this safe to adapt to asymmetric metrics.
acc_count_nb = accuracy_score(y_test, nb_1.predict(X_test_cv))
acc_tfidf_nb = accuracy_score(y_test, nb_2.predict(X_test_tfidf))

# display accuracies
print('acc_count_nb:',acc_count_nb, '\nacc_tfidf_nb:', acc_tfidf_nb)

acc_count_nb: 0.9253903591402728 
acc_tfidf_nb: 0.9266036042256438


In [9]:
# one-vs-rest logistic regression, one model per feature representation;
# the fixed random_state keeps the fits repeatable
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=42))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=42))

# fit on count vectorizer training data
logreg_1.fit(X_train_cv, y_train)

# fit on tfidf vectorizer training data
logreg_2.fit(X_train_tfidf, y_train)

# accuracy_score is documented as (y_true, y_pred): the ground truth goes first.
# Accuracy is symmetric, so the value is the same as with the arguments swapped,
# but the conventional order keeps this safe to adapt to asymmetric metrics.
acc_count_logreg = accuracy_score(y_test, logreg_1.predict(X_test_cv))
acc_tfidf_logreg = accuracy_score(y_test, logreg_2.predict(X_test_tfidf))

# display accuracies
print('\n\nacc_count_logreg=',acc_count_logreg, '\nacc_tfidf_logreg=',acc_tfidf_logreg)



acc_count_logreg= 0.9449502372239375 
acc_tfidf_logreg= 0.9391306063266293


### Predict on the test data and create the sample submission file

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [17]:
# Read the test data
test = pd.read_csv('test.csv')

# Storing the id from the test file
id_ = test['Id']

# Clean the titles in a single chained pass:
#   - missing / non-string titles become '' so every Id still gets a prediction
#     (passing NaN to a regex substitution would raise instead)
#   - every character outside a-z/A-Z is replaced by a space
#   - text is lowercased and split on whitespace
#   - stopwords are dropped and the remaining tokens re-joined with single spaces
test['TITLE'] = (
    test['TITLE']
    .fillna('')
    .astype(str)
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .str.lower()
    .str.split()
    .apply(lambda words: ' '.join(w for w in words if w not in stop))
)

# project onto the vocabularies already fitted on the training split (transform only)
test_cv = cv.transform(test['TITLE'])
test_tfidf = tv.transform(test['TITLE'])

# Predict on the test data with the count-vectorizer logistic regression
y_pred_test = logreg_1.predict(test_cv)
print(test_cv.shape)


(84484, 40795)


In [16]:
# NOTE(review): disabled leftover. The array shown in this cell's saved output
# already looks 1-D, so the flatten call is presumably unnecessary; confirm,
# then delete this cell together with its stale output.
#y_pred_test = y_pred_test.flatten()
#y_pred_test

array(['m', 't', 'm', ..., 'e', 't', 'e'], dtype='<U1')

In [28]:
# Create a sample submission file: test ids first, predicted category second
# NOTE(review): the id column is written under an empty header ('') — confirm
# this is the header the submission format expects.
sample_submission = pd.DataFrame({'': id_}).assign(CATEGORY=y_pred_test)
print(sample_submission.head(10))

# Write the submission to csv, with the header row and without the DataFrame index
sample_submission.to_csv(path_or_buf='sample_submission.csv', header=True, index=False)

# Code ends here

          CATEGORY
0   86998        m
1  112926        t
2  280943        m
3   37154        m
4  152800        t
5  412956        t
6  197094        b
7  117620        t
8   14854        e
9  342032        e
