In [1]:
import pandas as pd
import re
import string
import time
import numpy as np
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score

In [2]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file as a list, closing the file afterwards."""
    with open(path, 'r', encoding="utf-8") as handle:
        return handle.read().splitlines()


# Word lists, one entry per line of each file.
stopwords = _read_lines('nepali_stopwords.txt')
negWords = _read_lines('neg.csv')
posWords = _read_lines('positive_words.txt')

# headlines.txt is read as '#'-separated columns with no header row; the two
# columns are named body_text and label here.
data = pd.read_csv("headlines.txt", sep="#", header=None)
data.columns = ["body_text", "label"]

# Seeded so the row order is the same on every run of the notebook.
data = shuffle(data, random_state=10)

# NOTE(review): the filter value is 'neg ' with a trailing space, while the
# labels printed elsewhere in this notebook are 'neg'/'pos'. This looks like a
# check for badly formatted labels rather than a selection of negative rows;
# confirm before relying on negData.
negData = data.loc[data['label'] == 'neg ']
negData


Unnamed: 0,body_text,label


In [3]:
# Matches a word that ends in one of the listed suffixes; group 1 is whatever
# precedes the suffix (shortest possible prefix, because of the lazy `.*?`).
_SUFFIX_PATTERN = re.compile(
    r'^(.*?)(?:लाई|ले|लागि|बाट|देखि|को|की|का|मा|माथि|कै|हरु|हरू|मै|न्ने|सँग|सँगै|वटा)$')


def stemword(word):
    """Strip up to two trailing suffixes from ``word``.

    A suffix is only removed while something is left in front of it, so a word
    that consists solely of suffixes is never reduced to an empty string.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str or None
        The stem when at least one suffix was removed. Otherwise the word with
        apostrophes and spaces removed, or ``None`` if nothing is left after
        that clean-up.
    """
    stem = word
    for _ in range(2):
        match = _SUFFIX_PATTERN.match(stem)
        # Stop when there is no suffix, or when removing it would leave nothing.
        if match is None or not match.group(1):
            break
        stem = match.group(1)

    if stem != word:
        return stem

    cleaned = word.replace("'", "").replace(" ", "").strip()
    return cleaned if cleaned else None

In [4]:
# Words exempt from trailing-colon stripping in nepali_tokenize().
_COLON_LEXICON = frozenset([
    'अंशत:', 'मूलत:', 'सर्वत:', 'प्रथमत:', 'सम्भवत:', 'सामान्यत:', 'विशेषत:', 'प्रत्यक्षत:',
    'मुख्यत:', 'स्वरुपत:', 'अन्तत:', 'पूर्णत:', 'फलत:', 'क्रमश:', 'अक्षरश:', 'प्रायश:',
    'कोटिश:', 'शतश:', 'शब्दश:', 'अत:',
])

# Punctuation that is replaced by a space before splitting:
# , " ' ) ( { } [ ] ! ‘ ’ “ ” :- ? । / —
# Raw string so the backslashes reach the regex engine without relying on
# invalid string-literal escapes.
# NOTE(review): { } [ ] and :- are only matched when preceded by a space, as in
# the original pattern; confirm whether that restriction is intended.
_PUNCTUATION_RE = re.compile(
    r',|"|\'| \)|\(|\)| \{| \}| \[| \]|!|‘|’|“|”| :-|\?|।|/|—')


def nepali_tokenize(text):
    """Split ``text`` into whitespace-separated tokens after removing punctuation.

    A single trailing '-' is removed from a token, and a single trailing ':' is
    removed unless the token is listed in ``_COLON_LEXICON``. Tokens that become
    empty through this stripping (a lone '-' or a lone ':') are dropped.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        The non-empty tokens, in order of appearance.
    """
    spaced = _PUNCTUATION_RE.sub(' ', text)

    words = []
    for word in spaced.split():
        strip_colon = word.endswith(':') and word not in _COLON_LEXICON
        if word.endswith('-') or strip_colon:
            word = word[:-1]
        # A lone '-' or ':' is empty at this point and must not become a token.
        if word:
            words.append(word)

    return words

In [5]:
# One or more Devanagari digits. It is used with .match(), so it only tests
# whether a token *starts* with a digit.
nepali_num = re.compile(r'(०|१|२|३|४|५|६|७|८|९)+')


def clean_text(text):
    """Turn raw text into a list of stemmed tokens.

    Steps, in order:
      1. delete every character found in ``string.punctuation``;
      2. tokenize with ``nepali_tokenize``;
      3. drop tokens that start with a Devanagari digit;
      4. drop tokens listed in ``stopwords``;
      5. stem the remaining tokens with ``stemword`` and discard empty results.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        Non-empty stems in their original order.
    """
    # NOTE(review): ASCII punctuation is deleted rather than replaced by a
    # space, so "a,b" becomes the single token "ab"; confirm this is intended.
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = nepali_tokenize(text)
    non_numeric = [token for token in tokens if not nepali_num.match(token)]
    stems = (stemword(token) for token in non_numeric if token not in stopwords)
    # stemword() can return None or '' for degenerate tokens; neither is a
    # usable feature, so they are filtered out here.
    return [stem for stem in stems if stem]

In [6]:
def negCount(text):
    """Return how many tokens of ``clean_text(text)`` are present in ``negWords``."""
    tokens = clean_text(text)
    return len([token for token in tokens if token in negWords])

def posCount(text):
    """Return how many tokens of ``clean_text(text)`` are present in ``posWords``."""
    tokens = clean_text(text)
    return len([token for token in tokens if token in posWords])

In [7]:
# Per-headline lexicon hit counts, stored as two extra columns on `data`.
data['negCount'] = data['body_text'].apply(negCount)
data['posCount'] = data['body_text'].apply(posCount)
data.head()

Unnamed: 0,body_text,label,negCount,posCount
3636,दूतावासका लागि किनिएका जग्गा अलपत्र,neg,1,0
1394,आइपीएल : सन्दीपको बलमा पञ्जाबका कुरान आउट,pos,0,0
532,"गोल्डेन बुटमा लुकाकुको पनि दाबेदारी, रोनाल्डोस...",pos,0,0
4548,रासायनिक मल बिक्री,pos,0,0
2998,विप्लवलाई प्रतिवन्ध समाधान होइन : कांग्रेस,pos,0,0


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    data[['body_text', 'negCount', 'posCount']],
    data['label'],
    test_size=0.2,
    random_state=10,
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _combine_features(frame, tfidf_matrix):
    """Return negCount/posCount followed by the dense TF-IDF columns as one DataFrame.

    ``frame`` must contain 'negCount' and 'posCount' columns; its index is reset
    so rows line up positionally with the rows of ``tfidf_matrix``.
    """
    counts = frame[['negCount', 'posCount']].reset_index(drop=True)
    combined = pd.concat([counts, pd.DataFrame(tfidf_matrix.toarray())], axis=1)
    # The TF-IDF columns are labelled 0, 1, 2, ... while the count columns have
    # string labels. Recent scikit-learn estimators refuse frames with mixed
    # label types, so every label is converted to a string.
    combined.columns = combined.columns.astype(str)
    return combined


# clean_text is passed as a callable analyzer, so it produces the final tokens
# itself. ngram_range has no effect in that mode and is therefore not passed.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = _combine_features(X_train, tfidf_train)
X_test_vect = _combine_features(X_test, tfidf_test)


In [10]:
def train_gb(x, y):
    """Fit a GradientBoostingClassifier on ``(x, y)`` and time the fit.

    Returns a ``(fitted_model, fit_seconds)`` tuple, where ``fit_seconds`` is
    the wall-clock duration of the ``fit`` call.
    """
    hyperparams = dict(n_estimators=250, max_depth=31, learning_rate=0.05,
                       max_features='sqrt', subsample=0.95, random_state=10)
    classifier = GradientBoostingClassifier(**hyperparams)

    began = time.time()
    fitted = classifier.fit(x, y)
    elapsed = time.time() - began
    return fitted, elapsed

In [11]:
# Fit on the training features; train_gb also returns how long the fit took.
model,fit_time = train_gb(X_train_vect, y_train)
# Time prediction on the held-out features separately from fitting.
start = time.time()
y_pred = model.predict(X_test_vect)
end = time.time()
pred_time = end - start

In [12]:
# Binary precision/recall/F-score with 'neg' treated as the positive class.
precision, recall, fscore, train_support = score(
    y_test, y_pred, pos_label='neg', average='binary')

# Share of test rows whose predicted label equals the true label.
accuracy = (y_pred == y_test).sum() / len(y_pred)

report_values = [fit_time, pred_time, precision, recall, accuracy]
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    *(round(value, 3) for value in report_values)))

Fit time: 122.045 / Predict time: 0.756 ---- Precision: 0.811 / Recall: 0.729 / Accuracy: 0.799


In [18]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels, for the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[443, 140],
       [ 93, 605]], dtype=int64)

In [19]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         neg       0.83      0.76      0.79       583
         pos       0.81      0.87      0.84       698

   micro avg       0.82      0.82      0.82      1281
   macro avg       0.82      0.81      0.82      1281
weighted avg       0.82      0.82      0.82      1281



In [20]:
import pickle

# Persist the fitted vectorizer and the trained classifier. `with` makes sure
# each file is flushed and closed as soon as it has been written.
# NOTE: the vectorizer holds a reference to clean_text, and pickle stores
# functions by qualified name only, so whatever loads final_vector.sav must
# provide a clean_text function under the same module and name.
with open('final_vector.sav', 'wb') as vector_file:
    pickle.dump(tfidf_vect_fit, vector_file)
with open('final_gbc.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

ModuleNotFoundError: No module named 'xgboost'