## Importing Dependencies

In [5]:
import nltk

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
from string import punctuation
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Use seaborn's white-grid look for the figures drawn in this notebook.
sns.set(style="whitegrid")

## Loading Data And Overview

In [None]:
# Read the training and test CSV files (paths are relative to the working directory).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_set.csv', 'test_set.csv')
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

The data consists of two columns: `lang_id` and `text`.

In [None]:
# Distinct labels present in the lang_id column, in order of first appearance.
list(train_df['lang_id'].unique())

The data contains 11 different languages. Next, we check how the observations are distributed across them.

In [None]:
# Bar chart of how many training rows carry each language label.
# The title and x-axis label now describe languages; the original said "Sentiment",
# which does not match the lang_id column being plotted.
ax = train_df['lang_id'].value_counts().plot(kind='bar',
                                             title='Counts of Each Language Class',
                                             xlabel='Language',
                                             ylabel='Count')
n_obs = [patch.get_height() for patch in ax.patches]
total = sum(n_obs)

# Annotate every bar with its percentage share of all observations.
for patch in ax.patches:
    # get_x() positions the label horizontally; get_height() places it just above the bar.
    ax.text(patch.get_x() + 0.06, patch.get_height() + 0.5,
            str(round((patch.get_height() / total) * 100, 1)) + '%',
            fontsize=12, color='black')

The observations are evenly balanced across all classes.

## Text Preprocessing

In [None]:
def text_cleaner(text):
    """Return a cleaned copy of a pandas Series of strings.

    Every character listed in ``string.punctuation`` is removed and the
    remaining text is lower-cased. The input Series is not modified.

    Parameters
    ----------
    text : pandas.Series
        Raw text, one document per element. Missing values are passed
        through as missing instead of raising.

    Returns
    -------
    pandas.Series
        The cleaned text, carrying the same index as ``text``.
    """
    # Mapping each punctuation character to None makes translate() delete it.
    strip_punctuation = str.maketrans('', '', punctuation)
    res = text.str.translate(strip_punctuation).str.lower()
    # Return the cleaned Series (the original returned the untouched input).
    return res

In [None]:
# Add a `clean_text` column, derived from `text`, to both frames.
for frame in (train_df, test_df):
    frame['clean_text'] = text_cleaner(frame['text'])
train_df.head()

## Feature Extraction

In [None]:
# Features are the cleaned text, targets are the language labels.
X, y = train_df['clean_text'], train_df['lang_id']
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [None]:
# Pipeline 1: TF-IDF features over 1- to 4-grams feeding a logistic regression (C=1000).
vectorizer = TfidfVectorizer(ngram_range=(1, 4))
lr1 = Pipeline([('vectorizer', vectorizer), ('clf', LogisticRegression(C=1000))])

## Model Training

In [None]:
# Fit on the training split, then predict the held-out split (fit() returns the pipeline).
y_pred_lr1 = lr1.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 of lr1 on the held-out split.
print('Model: Logistic Regression',
      classification_report(y_test, y_pred_lr1),
      sep='\n')

In [None]:
# Predict the test set with lr1 and pair each prediction with its row's `index` value.
test_pred = lr1.predict(test_df['clean_text'])
my_submission = pd.DataFrame(
    {
        'index': test_df['index'],
        'lang_id': test_pred,
    }
)
# Any filename would do; this run is saved as submission.csv without the frame's row index.
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Pipeline 2: accent-stripped TF-IDF over 1- to 2-grams feeding a logistic regression (C=10).
vectorizer2 = TfidfVectorizer(ngram_range=(1, 2), strip_accents='ascii')
lr2 = Pipeline([('vectorizer', vectorizer2), ('clf', LogisticRegression(C=10))])

In [None]:
# Fit on the training split, then predict the held-out split (fit() returns the pipeline).
y_pred_lr2 = lr2.fit(X_train, y_train).predict(X_test)


In [None]:
# Per-class precision / recall / F1 of lr2 on the held-out split.
print('Model: Logistic Regression',
      classification_report(y_test, y_pred_lr2),
      sep='\n')

In [None]:
# Predict the test set with lr2 and pair each prediction with its row's `index` value.
test_pred2 = lr2.predict(test_df['clean_text'])
my_submission2 = pd.DataFrame(
    {
        'index': test_df['index'],
        'lang_id': test_pred2,
    }
)
# Any filename would do; this run is saved as submission2.csv without the frame's row index.
my_submission2.to_csv('submission2.csv', index=False)

In [None]:
# Pipeline 3: accent-stripped raw counts over 1- to 3-grams feeding a linear-kernel SVC (C=10).
vectorizer3 = CountVectorizer(ngram_range=(1, 3), strip_accents='ascii')
svc1 = Pipeline([('vectorizer', vectorizer3), ('clf', SVC(kernel='linear', C=10))])

In [None]:
# Fit on the training split, then predict the held-out split (fit() returns the pipeline).
y_pred_svc1 = svc1.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 of svc1 on the held-out split.
# The header now names the model actually being evaluated (an SVC with a linear
# kernel); the original printed "Logistic Regression".
print('Model: Support Vector Machine (linear kernel)')
print(classification_report(y_test, y_pred_svc1))

In [None]:
# Predict the test set with svc1 and pair each prediction with its row's `index` value.
test_pred3 = svc1.predict(test_df['clean_text'])
my_submission3 = pd.DataFrame(
    {
        'index': test_df['index'],
        'lang_id': test_pred3,
    }
)
# Any filename would do; this run is saved as submission3.csv without the frame's row index.
my_submission3.to_csv('submission3.csv', index=False)

In [None]:
# Pipeline 4: accent-stripped raw counts over 1- to 3-grams feeding an RBF-kernel SVC.
vectorizer4 = CountVectorizer(ngram_range=(1, 3), strip_accents='ascii')
svc2 = Pipeline([
    # Use this pipeline's own vectorizer. The original passed `vectorizer3`, which
    # left `vectorizer4` unused and made svc1 and svc2 share (and refit) one object.
    ('vectorizer', vectorizer4),
    ('clf', SVC(kernel='rbf'))
])

In [None]:
# Fit on the training split, then predict the held-out split (fit() returns the pipeline).
y_pred_svc2 = svc2.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 of svc2 on the held-out split.
print('Model: Support Vector Machine',
      classification_report(y_test, y_pred_svc2),
      sep='\n')

In [None]:
# Predict the test set with svc2 and pair each prediction with its row's `index` value.
test_pred4 = svc2.predict(test_df['clean_text'])
# Write svc2's own predictions: the original used `test_pred` (the lr1 output),
# so submission4.csv did not contain this model's results.
my_submission4 = pd.DataFrame({'index': test_df['index'], 'lang_id': test_pred4})
# Any filename would do; this run is saved as submission4.csv without the frame's row index.
my_submission4.to_csv('submission4.csv', index=False)

In [None]:
# Pipeline 5: accent-stripped raw counts feeding a random forest.
# NOTE(review): the original first argument was the undefined name `ngr` (NameError).
# ngram_range=(1, 3) mirrors the other CountVectorizer pipelines in this notebook —
# confirm this is the intended range.
vectorizer5 = CountVectorizer(ngram_range=(1, 3), strip_accents='ascii')
rf1 = Pipeline([
    ('vectorizer', vectorizer5),
    # Seed the forest (same seed as the train/test split) so re-runs give the same model.
    ('clf', RandomForestClassifier(random_state=42))
])

In [None]:
# Fit the rf1 pipeline (vectorizer + classifier) on the training split.
rf1.fit(X_train,y_train)

In [None]:
# Predict the held-out split with rf1 and report per-class precision / recall / F1.
y_pred_rf1 = rf1.predict(X_test)
# The header now names the model actually being evaluated; the original printed
# "Support Vector Machine" for this random-forest pipeline.
print('Model: Random Forest')
print(classification_report(y_test, y_pred_rf1))

### Multinomial Naive Bayes

In [None]:
# Accent-stripped TF-IDF features; terms found in more than 95% of documents are dropped.
vectorizer7 = TfidfVectorizer(
    use_idf=True,
    strip_accents='ascii',
    max_df=0.95,
)
# Learn the vocabulary on the training split only, then reuse it for the held-out split.
X_vector = vectorizer7.fit_transform(X_train)
X_test_vector = vectorizer7.transform(X_test)

In [None]:
# Multinomial Naive Bayes on the TF-IDF features.
nb1 = MultinomialNB()
# Fit on the sparse matrix directly: MultinomialNB accepts sparse input, and the
# original `.toarray()` materialised a dense (documents x vocabulary) array that
# can exhaust memory without changing the fitted model.
nb1.fit(X_vector, y_train)

In [None]:
# Predict the held-out split with nb1 and report per-class precision / recall / F1.
# The sparse matrix is passed as-is: `.todense()` returns a numpy.matrix, which
# recent scikit-learn releases reject, and densifying is unnecessary here.
y_pred_nb1 = nb1.predict(X_test_vector)
# The header now names the model actually being evaluated; the original printed
# "Support Vector Machine".
print('Model: Multinomial Naive Bayes')
print(classification_report(y_test, y_pred_nb1))

In [None]:
# Vectorise the test text with the already-fitted vectorizer7 and predict with nb1.
# The sparse matrix is passed as-is: `.todense()` returns a numpy.matrix, which
# recent scikit-learn releases reject, and densifying is unnecessary here.
test_pred7 = nb1.predict(vectorizer7.transform(test_df['clean_text']))
my_submission7 = pd.DataFrame({'index': test_df['index'], 'lang_id': test_pred7})
# Any filename would do; this run is saved as submission7.csv without the frame's row index.
my_submission7.to_csv('submission7.csv', index=False)