In [1]:
import pandas as pd
import nltk 
import re
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# Classifier classes (not instances) that will be compared against each other.
clfs = [
    GaussianNB,
    SVC,
    LinearSVC,
    LogisticRegression,
    SGDClassifier,
]

In [3]:
# Load the reviews from the CSV file and preview the first rows.
dataset = pd.read_csv(
    "RES1.csv",
    encoding="utf-8",
)
dataset.head()

Unnamed: 0,polarity,text,restaurant_id,user_id
0,-1,اولا: المنيو تغير الشورما اصبحت اعتياديه بأختف...,296,423
1,-1,من محلات الشاورما ذات الشعبيه لتميز الصلصات ال...,296,423
2,1,دجاج طازج يحضر امامك على الطلب لا يقوم باعدة ا...,5027,39580
3,1,فكما تعرف أستراليا بالكنغر والكوالا. فإنها تعر...,642,444
4,-1,إسمحو لي أن أقيم مطعم هاشم بصفتي فلسطيني عشت ف...,434,2191


In [4]:
# x1 = number of rows, y1 = number of columns of the loaded dataset.
x1 = dataset.shape[0]
y1 = dataset.shape[1]

In [5]:
# Preprocess the data: 1) replace everything that is not an Arabic letter, 2) remove stop words, 3) stem, 4) return the processed data
def preprocess(data):
    """Clean, stop-word-filter and stem the review text held in ``data``.

    For every row, each character of the ``text`` value outside the range
    ء-ي is replaced by a space, the result is tokenized with
    ``word_tokenize``, tokens present in the ``'arabic_ed'`` stop-word list
    are dropped, the remaining tokens are stemmed with ``ISRIStemmer`` and
    joined back together with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing a ``text`` column (strings) and a ``polarity`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input row with two columns, in this order:
        ``text`` (the processed review) and ``polarity`` (copied from ``data``).
    """
    st = ISRIStemmer()
    # Build the stop-word set once: it does not change between tokens, and
    # rebuilding it for every word dominated the running time.
    # NOTE(review): 'arabic_ed' is presumably a custom stop-word file placed in
    # the NLTK stopwords corpus directory - confirm it is installed.
    stop_words = set(stopwords.words('arabic_ed'))

    corpus = []
    # Walk the rows of the frame that was passed in (not notebook globals), so
    # the function works for any frame and any index.
    for text, polarity in zip(data['text'], data['polarity']):
        review = re.sub('[^ء-ي]', ' ', text)
        tokens = word_tokenize(review)
        stems = [st.stem(word) for word in tokens if word not in stop_words]
        corpus.append((' '.join(stems), polarity))

    # Use a list for the labels: a set has no defined order, so the column
    # names could end up attached to the wrong column.
    df_sen = pd.DataFrame(corpus, columns=['text', 'polarity'])
    return df_sen

In [6]:
def featureExtraction(df_sen):
    """Turn a preprocessed frame into bag-of-words features and labels.

    The first column of ``df_sen`` is vectorized with a ``CountVectorizer``
    limited to 5000 features and converted to a dense array; the values of
    the second column are returned as the labels.

    Returns
    -------
    tuple
        ``(X, y)``: the dense count matrix and the label array.
    """
    texts = df_sen.iloc[:, 0]
    labels = df_sen.iloc[:, 1].values

    vectorizer = CountVectorizer(max_features=5000)
    counts = vectorizer.fit_transform(texts)
    features = counts.toarray()
    return features, labels

In [7]:
# Fit the given classifier on the training split and report its accuracy on the test split
def model(clf,X,y):
    """Train a classifier on an 80/20 split of ``(X, y)`` and print its accuracy.

    Parameters
    ----------
    clf : callable
        Zero-argument factory (e.g. an estimator class) returning an object
        with ``fit`` and ``predict`` methods.
    X : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of ``X``.

    The split uses ``test_size=0.20`` and ``random_state=0``, so repeated
    calls evaluate every classifier on the same held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
    classifier = clf()
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # Correct predictions are the diagonal of the confusion matrix. Summing the
    # whole diagonal (instead of only cells [0,0] and [1,1]) stays correct for
    # any number of classes and does not fail when the matrix is 1x1.
    accuracy = cm.trace() / cm.sum()
    print("Accuracy = ", accuracy)

In [8]:
# Run the pipeline: preprocess the raw reviews, then derive features and labels.
data = preprocess(dataset)
X, y = featureExtraction(data)

In [None]:
# Evaluate every candidate classifier on the same features and labels.
clfs = [
    GaussianNB,
    SVC,
    LinearSVC,
    LogisticRegression,
    SGDClassifier,
]
for clf in clfs:
    model(clf, X, y)

# Most common words
# `corpus` only exists inside preprocess(), so it cannot be used here. Count
# the tokens of the preprocessed reviews instead; they sit in the first column
# of `data` and are space-separated.
tokens = [token for review in data.iloc[:, 0] for token in review.split()]
word=nltk.FreqDist(tokens)
word.most_common(10)