In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib
from joblib import dump, load

In [None]:
# Read the labelled training tweets and the unlabelled test tweets from the
# working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Fetch the NLTK stop-word list (the captured output shows it is a no-op when
# the package is already up to date).
nltk.download('stopwords')

# Loop invariants, built once: previously the stop-word set was rebuilt for
# every single word and a new stemmer was created for every tweet.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Normalise each training tweet: keep letters only, lower-case, drop English
# stop words, stem the remaining tokens and re-join them with single spaces.
corpus = []
# Iterate over the values directly instead of train['text'][i], which is a
# label lookup and only works while the frame has a default 0..n-1 index.
for text in train['text']:
    tweet = re.sub('[^a-zA-Z]', ' ', text)
    tweet = tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(word) for word in tweet if word not in english_stopwords]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Bag-of-words features capped at 1500 vocabulary entries; the fitted
# vectorizer is reused later to transform the test corpus.
cv = CountVectorizer(max_features=1500)
train_counts = cv.fit_transform(corpus)
X = train_counts.toarray()

In [None]:
# Two-topic LDA fitted on the training term counts (seeded for repeatability).
lda = LDA(
    n_components=2,
    random_state=42,
)
lda.fit(X)

In [None]:
import pickle

# Persist the fitted vectorizer and topic model, one pickle file per object.
for pkl_path, fitted_model in (('cv.pkl', cv), ('lda.pkl', lda)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_model, f)

In [None]:
# Per-tweet topic weights from the fitted LDA; the column index of the largest
# weight is stored as that tweet's topic.
X_lda = lda.transform(X)
train['topic'] = X_lda.argmax(axis=1)

In [None]:
# Hold out 20% of the labelled rows for validation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    train['target'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Gradient-boosted classifier trained on the bag-of-words features of the
# training split.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.2,
    max_depth=8,
)
xgb.fit(X_train, y_train)

In [None]:
# Persist the trained classifier twice: as a pickle file and as a joblib file.
with open('xgb.pkl', 'wb') as xgb_file:
    pickle.dump(xgb, xgb_file)

# Kept as the last expression so the cell still displays the written path.
joblib.dump(xgb, 'xgb.joblib')

['xgb.joblib']

In [None]:
# Predict labels for the held-out validation split, then show the feature
# matrix that was scored.
y_pred = xgb.predict(X_test)
print(X_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the validation split.
validation_report = classification_report(y_test, y_pred)
print(validation_report)

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.81      0.70      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [None]:
# Loop invariants, built once: previously the stop-word set was rebuilt for
# every single word and a new stemmer was created for every tweet.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Apply the same normalisation used for the training tweets: keep letters
# only, lower-case, drop English stop words, stem, and re-join with spaces.
corpus_test = []
# Iterate over the values directly instead of test['text'][i], which is a
# label lookup and only works while the frame has a default 0..n-1 index.
for text in test['text']:
    tweet = re.sub('[^a-zA-Z]', ' ', text)
    tweet = tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(word) for word in tweet if word not in english_stopwords]
    tweet = ' '.join(tweet)
    corpus_test.append(tweet)

In [None]:
# NOTE: this print runs before the reassignment below, so it shows whatever
# X_test held before this cell ran, not the new test-set matrix.
print(X_test)

# Encode the test tweets with the already-fitted vectorizer (transform only,
# no refit). X_test is rebound here to the test-set term counts.
test_counts = cv.transform(corpus_test)
X_test = test_counts.toarray()

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Predict targets for the test tweets from their term-count features.
y_pred_test = xgb.predict(X_test)

# Tag each test tweet with the index of its highest-weight LDA topic.
X_test_lda = lda.transform(X_test)
test['topic'] = X_test_lda.argmax(axis=1)

In [None]:
# Attach the predictions and write the id/target pairs, without the index
# column, to the submission file.
test['target'] = y_pred_test
submission_columns = ['id', 'target']
test[submission_columns].to_csv('submission.csv', index=False)