In [1]:
# utilities
import re
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk for natural language processing
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

## Reading sentiment data in a dataframe

In [3]:
# Load the tab-separated sentiment file; header=None means no row is consumed as a header.
df = pd.read_csv('../data/Tamil_first_ready_for_sentiment.csv',sep='\t',header=None)
# Name the columns: sentiment label first, comment text second.
# NOTE(review): assumes the file has exactly two columns — confirm against the data.
df.columns=["target","text"]
df

Unnamed: 0,target,text
0,Negative,Enna da ellam avan seyal Mari iruku
1,Negative,This movei is just like ellam avan seyal
2,Positive,Padam vanthathum 13k dislike pottavaga yellam...
3,Positive,Neraya neraya neraya... ... V era level...thala
4,Positive,wow thavala sema mass....padam oru pundaikum ...
...,...,...
15739,Mixed_feelings,ivaru cinemala laam nalla tha prasuraaru...aa...
15740,Positive,Pattaya Kilaputhupaa trailer... !!!!! Get Raj...
15741,Mixed_feelings,En innum trending la varala? Ennada panringa ...
15742,not-Tamil,Rajnikant sir plz aap india ke pm ban jaao


In [None]:
# Report how many rows were loaded.
print('length of data is', len(df))

In [None]:
# Print the frame summary produced by DataFrame.info().
df.info()

In [None]:
# Show the dtype of each column.
df.dtypes

## Check for null values

In [None]:
# Count the rows that contain at least one missing value (any(axis=1) flags a row, np.sum counts the flags).
np.sum(df.isnull().any(axis=1))

## Number of target or classes in the dataset

In [None]:
# List the distinct label values present in the 'target' column.
df['target'].unique()

## Distribution of target labels in the dataset

In [None]:
# seaborn is already imported in the first cell; this re-import is harmless.
import seaborn as sns
# Bar chart of the number of rows per 'target' label.
sns.countplot(x='target', data=df)

# Preprocessing the text data before feeding it to ML model

## Removing punctuations

In [None]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    """Return `text` with every character of `punctuations_list` deleted.

    `text` must be a str; all other characters (letters, digits,
    whitespace, non-ASCII symbols) are kept unchanged.
    """
    # Mapping an ordinal to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, punctuations_list))
    return text.translate(drop_table)
# Strip punctuation from every comment, then preview the last rows.
df['text'] = df['text'].apply(cleaning_punctuations)
df['text'].tail()

## Removing any numbers from the text

In [None]:
def cleaning_numbers(data):
    """Return `data` with every run of ASCII digits (0-9) removed."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', data)
# Remove digits from every comment, then preview the last rows.
df['text'] = df['text'].apply(cleaning_numbers)
df['text'].tail()

## convert tamil text to English

In [None]:
from googletrans import Translator
# Single shared googletrans client; convertTamiltoEnglish reads this global.
translator = Translator()

## convertTamiltoEnglish utility function to translate tamil tweets to English

In [None]:
import time
def convertTamiltoEnglish(tm):
    """Translate `tm` from Tamil ('ta') to English ('en') using the global `translator`.

    Returns the `.text` attribute of the object returned by
    `translator.translate`.
    """
    # Optional throttle between requests, currently disabled: time.sleep(0.2)
    translated = translator.translate(tm, src='ta', dest='en')
    return translated.text

## Using the final result of the translation

In [None]:
# Load the saved translation output (comma-separated, first row is the header).
# NOTE(review): this path is 'data/...' while the raw file earlier in the notebook is read
# from '../data/...' — confirm both resolve from the same working directory.
final_df=pd.read_csv("data/tamiltoEnglish.csv",sep=",",header=0)
final_df

## Making statement text in lower case

In [None]:
# Lower-case the translated text so later matching (e.g. stopwords) is case-insensitive.
final_df['english']=final_df['english'].str.lower()
final_df['english'].tail()

In [None]:
## Stopwords are generally not useful for the task. Hence, they need to be removed, e.g. a, the, it...

In [None]:
from nltk.corpus import stopwords
# The NLTK 'stopwords' corpus must already be available locally; the download call is left disabled.
#nltk.download('stopwords')
# English stopword list as provided by NLTK.
stopwordlist=stopwords.words('english')
stopwordlist

## Removing stopwords from the data

In [None]:
# A set gives constant-time membership checks during filtering.
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    """Return `text` with every whitespace-separated word found in STOPWORDS removed.

    The input is converted with str() first, so non-string values are
    processed via their string form. Remaining words are re-joined with
    single spaces.
    """
    kept_words = filter(lambda word: word not in STOPWORDS, str(text).split())
    return " ".join(kept_words)
final_df['english'] = final_df['english'].apply(cleaning_stopwords)
final_df['english'].head()

## Removing punctuations

In [None]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
# NOTE(review): this repeats the cleaning_punctuations definition from the earlier
# "Removing punctuations" cell; the later definition replaces the earlier one.
def cleaning_punctuations(text):
    """Delete every character listed in `punctuations_list` from the str `text`."""
    # A dict whose values are None makes str.maketrans build a pure-deletion table.
    removal_table = str.maketrans(dict.fromkeys(punctuations_list))
    return text.translate(removal_table)
# Strip punctuation from the translated text, then preview the last rows.
final_df['english'] = final_df['english'].apply(cleaning_punctuations)
final_df['english'].tail()

## Removing any numbers from the text

In [None]:
# NOTE(review): this repeats the cleaning_numbers definition from the earlier
# "Removing any numbers from the text" cell.
def cleaning_numbers(data):
    """Strip all runs of ASCII digits (0-9) from the string `data`."""
    return re.sub(pattern='[0-9]+', repl='', string=data)
# Remove digits from the translated text, then preview the last rows.
final_df['english'] = final_df['english'].apply(cleaning_numbers)
final_df['english'].tail()

## tokenization of tweet text

In [None]:
from nltk.tokenize import RegexpTokenizer
# r'\w+' makes each maximal run of word characters one token.
tokenizer = RegexpTokenizer(r'\w+')
# Replace each text cell with the tokenizer's output for that cell.
final_df['english']=final_df['english'].apply(tokenizer.tokenize)
final_df['english'].head()

In [None]:
# Inspect the frame after tokenization.
final_df

## Applying Stemming: Reducing a word to its stem

In [None]:
import nltk
st = nltk.PorterStemmer()
def stemming_on_text(data):
    """Return a new list holding the Porter stem of every token in `data`.

    Parameters
    ----------
    data : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    # Return the stemmed tokens; returning `data` itself would discard the
    # stemming and leave the column unchanged.
    return [st.stem(word) for word in data]
final_df['english'] = final_df['english'].apply(stemming_on_text)
final_df['english'].head()

## Lemmatization considers the context and converts the word to its meaningful base form

In [None]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    """Return a new list holding the WordNet lemma of every token in `data`.

    Each token is passed to `lm.lemmatize` with its default arguments.

    Parameters
    ----------
    data : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # Return the lemmatized tokens; returning `data` itself would make this
    # step a no-op.
    return [lm.lemmatize(word) for word in data]
final_df['english'] = final_df['english'].apply(lemmatizer_on_text)
final_df['english'].head()

In [None]:
# Replace the index with a fresh default one; drop=True discards the old index instead of keeping it as a column.
final_df=final_df.reset_index(drop=True)
final_df

In [None]:
# Put the labelled frame and the translated frame side by side (axis=1 aligns rows on the index).
# NOTE(review): assumes both frames have the same length and row order — confirm, otherwise rows mis-align or NaNs appear.
df_concat = pd.concat([df,final_df],axis=1)
# Keep only the label, the original text and the processed English tokens.
# NOTE(review): 'tamil' and 'english' are expected to come from final_df — TODO confirm its schema.
df_concat=df_concat[["target","tamil","english"]]
df_concat

In [None]:
# Select the rows labelled positive. Whitespace around the label is stripped
# before comparing, so both 'Positive' and 'Positive ' match; an exact match on
# the padded literal would silently return an empty frame for unpadded labels.
df_pos = df_concat.loc[df_concat["target"].str.strip() == "Positive"]
df_pos

In [None]:
def _as_text(cell):
    """Turn one 'english' cell into plain text: token sequences are space-joined, anything else goes through str()."""
    if isinstance(cell, (list, tuple)):
        return " ".join(str(token) for token in cell)
    return str(cell)

# Word cloud of the positive comments only. The label is stripped before
# comparing so padded labels such as 'Positive ' are matched as well.
is_positive = df_concat['target'].str.strip() == 'Positive'
data_pos = df_concat.loc[is_positive, 'english']
# Build the corpus from the tokens themselves, separated by spaces. Stringifying
# the lists leaks brackets and quotes into the text, and Series.replace("'", "")
# only replaces cells whose whole value is "'", so it never removed them.
corpus_pos = " ".join(_as_text(cell) for cell in data_pos)
wc = WordCloud(max_words = 500 , width = 1600 , height = 800,
              collocations=False).generate(corpus_pos)
plt.figure(figsize = (20,20))
plt.imshow(wc)

In [None]:
# Features: the processed English column; labels: the sentiment target.
X=df_concat.english
y=df_concat.target

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state =12312)

In [None]:
# Cast every training cell to its string form so the vectoriser receives text.
# NOTE(review): if the cells are token lists at this point, this yields the list's repr — confirm that is intended.
X_train=X_train.astype(str)
X_train

In [None]:
# TF-IDF over unigrams and bigrams, capped at 500,000 features; fitted on the
# training split only.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
# vocabulary_ maps every learned term to its column index, so its size is the
# number of features. get_feature_names() is not used because it was removed
# in scikit-learn 1.2.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

In [None]:
# Replace the training text with its TF-IDF matrix.
# NOTE(review): X_train is rebound here, so re-running this cell alone would pass the matrix (not text) to transform.
X_train = vectoriser.transform(X_train)
# The test split is transformed in a later cell:
#X_test  = vectoriser.transform(X_test.astype(str))

In [None]:
# Cast every test cell to its string form, mirroring the training split.
X_test=X_test.astype(str)
X_test

In [None]:
# Project the test text into the feature space learned from the training split (transform only, no refit).
X_test  = vectoriser.transform(X_test.astype(str))

In [None]:
def model_Evaluate(model):
    """Print a classification report for `model` on the held-out test split.

    Reads the notebook globals `X_test` (features) and `y_test` (true
    labels); `model` must expose a `predict` method. Nothing is returned.
    """
    report = classification_report(y_test, model.predict(X_test))
    print(report)

## The target (dependent variable) has more than two classes, i.e. this is a multiclass problem

In [None]:
# L2-regularised multinomial logistic regression fitted with L-BFGS.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version.
LRmodel = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=2000,
)
LRmodel.fit(X_train, y_train)
# Print per-class precision/recall/F1 on the test split.
model_Evaluate(LRmodel)