The data contains reviews about places along with their ratings ranging from 1 to 5

## Before working on NLP, download the required packages

In [1]:
# !pip install pandas
# !pip install nltk

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Use the labelled data data for pre-processing

In [5]:
df = pd.read_csv("labeled_data.csv")
df.head(1)

Unnamed: 0,text,label
0,The new rule is - \r\nif you are waiting for a...,4


Remove stopwords from the data

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\NAYANSHREE
[nltk_data]     SINHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Word lemmatizer along with removing .5% and .95% threshold values

In [4]:
text = df.text.tolist()

# get all words with doc feq
all_words = {}
lemmatizer = WordNetLemmatizer()
for each in text:
    c = 0
    for e in each.split(" "):
        e = lemmatizer.lemmatize(e,pos="v")
        if e not in all_words:
            all_words[e] = 1
        else:
            all_words[e] += 1

# now got the document frequecy of word. lets check for 5% and 95%
five_percent = 0.05*len(df)
ninty_five_percent = 0.95*len(df)

all_words_1 = all_words.copy()

for k,v in all_words.items():
    if all_words[k] < five_percent or all_words[k] > ninty_five_percent:
        del all_words_1[k]

Using a clean_text function to remove all special characters and stopwords

In [5]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^a-z ]')
STOPWORDS = set(stopwords.words('english'))
badwords = set(all_words.keys())

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if ((word not in STOPWORDS) and (word in badwords))) # delete stopwors from tex
        
    return text

df["text"] = df['text'].apply(clean_text)

Tokenize the words

In [6]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
   
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]



In [7]:
y = df['label']
X = df['text']

Split train and test data to predict the labels

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state=42)

In [10]:
## Best till now
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

lr_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2),
                             token_pattern=r'\b\w+\b',
                             tokenizer=LemmaTokenizer())),
    ('tfidf', TfidfTransformer())
    
])


In [None]:
lr_1 = lr_pipeline.fit(X_train,y_train)

lr_pred = lr_1.predict(X_test)
accuracy_score(y_test,lr_pred)


In [30]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [33]:
from sklearn.pipeline import Pipeline

lr_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=0)))
])

lr = lr_pipe.fit(X_train, y_train)
y_test = lr_pipe.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.34


In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

lr_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(solver='saga',max_iter=200,multi_class='multinomial'))
])

lr = lr_pipe.fit(X_train, y_train)

In [17]:
y_pred = lr.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5641


In [28]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5428


In [28]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


lr_pipe = LinearDiscriminantAnalysis()
lr_pipe.fit(X_train, y_train)

y_pred = lr_pipe.predict(X_test)



In [29]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.3


In [30]:
from sklearn.naive_bayes import MultinomialNB
lr_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))
])
clf = lr_pipe.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5258666666666667


In [35]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/5e/49/b95c037b717b4ceadc76b6e164603471225c27052d1611d5a2e832757945/xgboost-0.90-py2.py3-none-win_amd64.whl (18.3MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.90


twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
You are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [36]:
from xgboost import XGBClassifier


In [39]:
lr_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier(max_depth=5, num_class=5))
])
clf = lr_pipe.fit(X_train, y_train)

y_pred = clf.predict(X_test)

  if diff:


In [40]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5002
