In [None]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install sklearn

In [80]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

import re # for regular expression
import string
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter 

%matplotlib inline

In [None]:
# Change the pandas display settings used when frames are rendered in cells
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)     # show every row
# None disables column-width truncation. The former sentinel -1 was deprecated
# in pandas 1.0 and is no longer accepted by current releases.
pd.set_option('display.max_colwidth', None)
# Render floats with five decimal places
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [82]:
# Load the dataset workbook into a DataFrame
DATASET_PATH = '/content/datasetMovie.xltx'
data_df = pd.read_excel(DATASET_PATH)

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the dataset is read from /content/datasetMovie.xltx, which is
# not under the mount point — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

# Data Exploration

In [None]:
# Number of rows in the dataset
len(data_df)

In [None]:
# Column labels of the loaded frame
data_df.columns

In [None]:
# Non-null value count for each column
data_df.count()

In [None]:
# Per-column dtype and non-null count, plus memory usage
data_df.info()

In [None]:
# Summary statistics (pandas describes the numeric columns by default)
data_df.describe()

In [None]:
# First five rows of the raw data
data_df.head()

# Data Cleaning

In [90]:
# Punctuation characters (Arabic-specific + ASCII) removed from every text
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations
# Translation table built once here instead of on every call; it maps each
# punctuation character to None, i.e. deletes it.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return `text` with every punctuation character removed.

    Characters listed in `punctuations_list` are deleted outright (they are
    not replaced by a space), so words separated only by punctuation are
    joined together.

    Args:
        text (str): The text to clean.

    Returns:
        str: `text` without any character from `punctuations_list`.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
# Peek at three randomly chosen rows
data_df.sample(3)

In [92]:
def normalize_arabic(text):
    """Map Arabic letter variants onto a single canonical letter.

    The alef forms are folded to bare alef, alef maqsura to yeh, teh marbuta
    to heh, gaf to kaf and waw-with-hamza to waw.

    Args:
        text (str): The text to normalize.

    Returns:
        str: The normalized text.
    """
    # (pattern, replacement) pairs, applied one after another in this order
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
        ("ؤ", "و"),
    )
    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

In [None]:
# Peek at three randomly chosen rows
data_df.sample(3)

In [94]:
 
def processPost(tweet):
    """Clean one raw post and return the cleaned text.

    Steps, in order: replace @mentions and URLs with a space, strip the
    leading '#' from hashtags, delete punctuation, normalize Arabic letter
    variants, drop every digit, and replace runs of ASCII letters with a
    space.

    Args:
        tweet: The raw post text. A missing value (None/NaN) yields an empty
            string and any other non-string value is converted with str(),
            so the function can be applied to a spreadsheet column that
            contains empty or numeric cells without raising.

    Returns:
        str: The cleaned text.
    """
    # Guard against missing / non-text cells before any regex is applied
    if not isinstance(tweet, str):
        if pd.isna(tweet):
            return ''
        tweet = str(tweet)

    # Replace @username with a space (raw strings keep \s a regex escape
    # rather than an invalid Python string escape)
    tweet = re.sub(r'@[^\s]+', ' ', tweet)

    # Convert www.* or https?://* to " "
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)

    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # Remove punctuation
    tweet = remove_punctuations(tweet)

    # Normalize Arabic letter variants
    tweet = normalize_arabic(tweet)

    # Remove digits (str.isdigit also matches Arabic-Indic digits)
    tweet = ''.join(ch for ch in tweet if not ch.isdigit())

    # Replace runs of English letters with a space: lowercase runs first,
    # then uppercase runs, so a mixed-case word yields one space per run
    tweet = re.sub(r'[a-z]+', " ", tweet)
    tweet = re.sub(r'[A-Z]+', " ", tweet)

    return tweet

In [95]:
# Clean every post and store the result in a new "clean text" column
data_df["clean text"] = data_df["Text"].apply(processPost)

In [96]:
# Character count of each cleaned text
data_df["text length"] = data_df["clean text"].map(len)

In [None]:
# Peek at three random rows, now including "clean text" and "text length"
data_df.sample(3)

In [98]:
# Tokenize each cleaned text: every match of the pattern \w+ (a run of word
# characters) becomes one token
tokenizer = RegexpTokenizer(r'\w+')
data_df["tokens"] = data_df["clean text"].apply(tokenizer.tokenize)

In [None]:
# First five rows, now including the "tokens" column
data_df.head()

In [None]:
# Flatten the per-row token lists into one corpus-wide list of words
all_words = [token for row_tokens in data_df["tokens"] for token in row_tokens]
# Token count of every row
sentence_lengths = list(map(len, data_df["tokens"]))

# Sorted list of the distinct words
VOCAB = sorted(set(all_words))

print(f"{len(all_words)} words total, with a vocabulary size of {len(VOCAB)}")
print(f"Max sentence length is {max(sentence_lengths)}")

In [101]:
# Count how many times each word occurs across the whole corpus
word_counter = Counter(all_words)

In [None]:
# Show the 10 most common words with their counts
word_counter.most_common(10)

In [None]:
# Display the 10 least common words (tail of the frequency-ordered list)
word_counter.most_common()[-10:]

# Descriptive analysis

In [None]:
# (rows, columns) of the frame after the cleaning columns were added
data_df.shape

In [None]:
# Column labels, including the derived cleaning columns
data_df.columns

In [None]:
# Full per-column summary (dtype and non-null count)
data_df.info(verbose = True)

In [None]:
# Summary statistics restricted to the text and label columns
data_df[['Text','classification']].describe()

In [None]:
# Summary statistics (pandas describes the numeric columns by default)
data_df.describe()

In [109]:
#libraries for word occurrence()
from sklearn.feature_extraction.text  import TfidfTransformer
from sklearn.feature_extraction.text  import CountVectorizer

In [None]:
# Instantiate CountVectorizer()
countVec = CountVectorizer()

# Build the document-term count matrix from the Text column, cast to unicode strings.
# NOTE(review): this uses the raw 'Text' column rather than 'clean text' — confirm that is intended.
word_count_vector = countVec.fit_transform(data_df['Text'].astype('U'))
word_count_vector.shape

In [None]:
# Transformer that turns a count matrix into a normalized tf-idf representation
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
# Fitting on the count matrix learns the idf weight of every vocabulary term
tfidf_transformer.fit(word_count_vector)

In [None]:
# Tabulate the learned idf weight of every vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(countVec, "get_feature_names_out"):
    feature_names = countVec.get_feature_names_out()
else:
    feature_names = countVec.get_feature_names()
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"])

In [None]:
# Most frequent terms: the 10 lowest idf weights, i.e. terms found in the most documents
df_idf.sort_values(by=["idf_weights"]).head(10)

In [None]:
# Least frequent terms: the 10 highest idf weights, i.e. terms found in the fewest documents
df_idf.sort_values(by=["idf_weights"]).tail(10)

# Predictive Statistics

In [116]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics  import confusion_matrix, classification_report
from sklearn import metrics

In [117]:
# Keep only the rows whose classification label is present
data_df = data_df[data_df['classification'].notna()]

In [118]:
# Exclude the rows labelled "Neutral"
data_df = data_df.loc[data_df['classification'].ne("Neutral")]

In [None]:
# First five rows after the label filtering
data_df.head(5)

In [120]:
# Encode the labels as integers: Positive -> 1, Negative -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: re-running
# it on already-encoded labels no longer turns them all into NaN.
label_to_int = {'Positive': 1, 'Negative': 0, 1: 1, 0: 0}
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on the filtered slice produced by the cells above.
data_df = data_df.assign(classification=data_df['classification'].map(label_to_int))

In [None]:
# First five rows after the labels were converted to numbers
data_df.head(5)

In [122]:
# Identify the data (the cleaned text) and the labels (the numeric classification)
data= data_df['clean text']
target= data_df['classification']

In [None]:
# Use TfidfVectorizer for feature extraction (TF-IDF converts the textual data to numeric form).
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split below, so its vocabulary and idf statistics include the test documents.
tf_vec = TfidfVectorizer()
X = tf_vec.fit_transform(data)
X.shape

In [124]:
# Split into train and test sets: half of the samples are held out for testing,
# and random_state=0 makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.50, random_state=0)

In [None]:
# Report the shapes of the feature matrix and label vector in each partition
print("Training: ", X_train.shape, y_train.shape)
print("Testing: ", X_test.shape, y_test.shape)

# Classifier: Naive Bayes


In [None]:
# Create the classifier and fit it on the training data and labels.
# MultinomialNB accepts the sparse TF-IDF matrix directly. The former
# .todense() calls produced np.matrix objects, which scikit-learn >= 1.2
# rejects with a TypeError, and materialised the whole matrix in memory.
classifier_nb = MultinomialNB().fit(X_train, y_train)

print("MultinomialNB accuracy: %.2f"%classifier_nb.score(X_test, y_test))
print('_'*100)

# Do a 10-fold cross-validation (cross_val_score fits a fresh clone per fold)
results_nb = cross_val_score(classifier_nb, X, target, cv=10)
print("\n10-fold cross-validation:")
print(results_nb)

print("The average accuracy of the MultinomialNB classifier is : %.2f" % np.mean(results_nb))
print('_'*100)

# Predict once and reuse the result for both reports below
predicted_nb = classifier_nb.predict(X_test)

print("\nConfusion matrix of the MultinomialNB classifier:")
print(confusion_matrix(y_test,predicted_nb))
print('_'*100)

print("\nClassification_report of MultinomialNB classifier:")
print(classification_report(y_test,predicted_nb))
print('_'*100)

In [127]:
# Calculate the fpr and tpr for all thresholds of the classification
probs = classifier_nb.predict_proba(X_test)
# Column 1 holds the probability of the second class in classifier_nb.classes_
# (presumably label 1, "Positive", given the 0/1 encoding — verify)
preds = probs[:,1]

# ROC points at every score threshold, and the area under that curve
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Plot the ROC curve with its AUC in the legend.
# The title names MultinomialNB: the curve is computed from classifier_nb,
# not from an SVM as the previous title claimed.
plt.title('Receiver Operating Characteristic MultinomialNB classifier')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line: the performance of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()