In [119]:
import collections
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.metrics import precision_score
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
import joblib
import re
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.calibration import CalibratedClassifierCV
import pickle
import dill


In [2]:
_DEFAULT_LABELS=['negative','positive','neutral']

In [121]:
class SentimentIdentificationML(object):

  """A class for training, evaluating and running the sentiment classification
     After initializing an instance, you must
    run the train method once before using it.
    Args:
        labels (:obj:`set` of :obj:`str`, optional): The set of sentiment labels
            used in the training data in the main model.
            If None, the default labels are used.
            Defaults to None.
       
    """
  def __init__(self, labels=None,
                 ):
        if labels is None:
            labels = _DEFAULT_LABELS
        self._labels_sorted = sorted(labels)
        self._is_trained = False

  def _apply_preprocessing(self,text):

      # ref: https://github.com/bakrianoo/aravec
      search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى",
                "\\",'\n', '\t','&quot;','?','؟','!']
      replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا",
                "","","","ي","",' ', ' ',' ',' ? ',' ؟ ', ' ! ']
      
      tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
      text = re.sub(tashkeel,"", text)
      
      longation = re.compile(r'(.)\1+')
      subst = r"\1\1"
      text = re.sub(longation, subst, text)
      
      text = re.sub(r"[^\w\s]", '', text)
      text = re.sub(r"[a-zA-Z]", '', text)
      text = re.sub(r"\d+", ' ', text)
      text = re.sub(r"\n+", ' ', text)
      text = re.sub(r"\t+", ' ', text)
      text = re.sub(r"\r+", ' ', text)
      text = re.sub(r"\s+", ' ', text)
      text = text.replace('وو', 'و')
      text = text.replace('يي', 'ي')
      text = text.replace('اا', 'ا')
      
      for i in range(0, len(search)):
          text = text.replace(search[i], replace[i])
      
      text = text.strip()


      
      return text


  def _prepare_sentences(self, sentences):
      cleaned_sent=  sentences.apply(self._apply_preprocessing)             
      sent_array = np.array(cleaned_sent)
      if not self._is_trained:
        x_trans = self._feat_union.fit_transform(sent_array)
      
      else:
        x_trans = self._feat_union.transform(sent_array)

      return x_trans

  def  use_defult_fearure_vec(self):

      
      union = FeatureUnion([("tf_idf_ngra_3", TfidfVectorizer(min_df=1,ngram_range=(1, 3),binary=True,max_features=700)),
                        ( "tf_idf_ngra_2", TfidfVectorizer(min_df=1,ngram_range=(1, 2),binary=True,max_features=700)),
                      ("tf_idf_ngra_1",TfidfVectorizer(min_df=1,ngram_range=(1, 1),binary=True,max_features=700)),
                      ("tf_idf_ngra_5", TfidfVectorizer(min_df=1,ngram_range=(1, 5),binary=True,max_features=700)),
                      ("tf_idf_ngra_7", TfidfVectorizer(min_df=1,ngram_range=(1, 7),binary=True,max_features=700)),
                      ("tf_idf_ngra_3_ch",TfidfVectorizer(min_df=1,ngram_range=(1, 3),binary=True,max_features=700,analyzer='char')),
                      ("tf_idf_ngra_2_ch",TfidfVectorizer(min_df=1,ngram_range=(1, 2),binary=True,max_features=700,analyzer='char')),
                      ("tf_idf_ngra_1_ch",TfidfVectorizer(min_df=1,ngram_range=(1, 1),binary=True,max_features=700,analyzer='char')),
                      ("tf_idf_ngra_5_ch",TfidfVectorizer(min_df=1,ngram_range=(1, 5),binary=True,max_features=700,analyzer='char')),
                      ("tf_idf_ngra_7_ch", TfidfVectorizer(min_df=1,ngram_range=(1, 7),binary=True,max_features=700,analyzer='char')),
                      ("cnt_vec_ngra_3", CountVectorizer(min_df=1,ngram_range=(1, 3),binary=True,max_features=700)),
                      ("cnt_vec_ngra_5", CountVectorizer(min_df=1,ngram_range=(1, 5),binary=True,max_features=700))
                                      
                                      
                                      ])
      self._feat_union = union

  def use_recomended_feature_vec(self,char_ngram_range,word_ngram_range):
      word_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=word_ngram_range,
                                          analyzer='word',
                                          tokenizer=lambda x: x.split(' '))
      char_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=char_ngram_range,
                                          analyzer='char',
                                          tokenizer=lambda x: x.split(' '))
      self._feat_union = FeatureUnion([('wordgrams', word_vectorizer),
                                               ('chargrams', char_vectorizer)])
        


  

  def train(self, X_train,
              y_train,
              use_defult_fearures=False,
              char_ngram_range=(1, 3),
              word_ngram_range=(1, 1),
              n_jobs=None):
        """Trains the model on a given data set.
        Args:
            X_train (:obj:`np array or pandas series`, optional): loaded training data.
               
            y_train (:obj:`np array or pandas series`, optional): loaded labels for training.

            use_defult_fearures (:obj:'bool', optional): Use recommended feature vector 


            char_ngram_range (:obj:`tuple`, optional): The n-gram ranges to
                consider in the character-based language models.
                Defaults to (1, 3).
            word_ngram_range (:obj:`tuple`, optional): The n-gram ranges to
                consider in the word-based language models.
                Defaults to (1, 1).
            n_jobs (:obj:`int`, optional): The number of parallel jobs to use
                for computation. If None, then only 1 job is used.
                If -1 then all processors are used. Defaults to None.
        """

       

        
       

        # Build and train extra classifier
        if use_defult_fearures:
          self.use_defult_fearure_vec()
        else:
          self.use_recomended_feature_vec(char_ngram_range,word_ngram_range)   

        x_prepared = self._prepare_sentences(X_train)    
       
       

        self._classifier = CalibratedClassifierCV(LinearSVC(C=0.01, class_weight='balanced', penalty='l2'))
        self._classifier.fit(x_prepared, y_train)
        self._is_trained = True

  def eval(self,  X_eval,
              y_eval, data_set='DEV'):
        """Evaluate the trained model on a given data set.
        Args:
            X_eval (:obj:`np array or pandas series`, optional): loaded data for evaluation.

            y_eval (:obj:`np array or pandas series`, optional): loaded labels for evaluation.

            data_set (:obj:`str`, optional): Name of the provided data set to
                use. This is ignored if data_path is not None. Can be either
                'VALIDATION' or 'TEST'. Defaults to 'VALIDATION'.
        Returns:
            :obj:`dict`: A dictionary mapping an evaluation metric to its
            computed value. The metrics used are accuracy, f1_micro, f1_macro,
            recall_micro, recall_macro, precision_micro and precision_macro.
        """

        if not self._is_trained:
            raise UntrainedModelError(
                'Can\'t evaluate an untrained model.')

       

        # # prepare data
        # eval_data = self._prepare_sentences(X_eval)
        

        # Generate predictions
        did_pred = self.predict(X_eval)
        did_pred = [d.top for d in did_pred]
       

        # Get scores
        scores = {
            'Sentiment': {
                'accuracy': accuracy_score(y_eval, did_pred),
                'f1_macro': f1_score(y_eval, did_pred,
                                     average='macro'),
                'recall_macro': recall_score(y_eval, did_pred,
                                             average='macro'),
                'precision_macro': precision_score(y_eval,
                                                   did_pred,
                                                   average='macro')
            }
        }

        return scores    

  def predict(self, sentences, output='label'):
        """Predict the sentiment  probability scores for a given list of
        sentences.
        Args:
            sentences (:obj:`list` of :obj:`str`): The list of sentences.
            output (:obj:`str`): The output label type. Possible values are
                'postive', 'neagtive', 'neutral'.
        Returns:
            :obj:`list` of :obj:`DIDPred`: A list of prediction results,
            each corresponding to its respective sentence.
        """

        if not self._is_trained:
            raise UntrainedModelError(
                'Can\'t predict with an untrained model.')

        
        if isinstance(sentences, str):
          sentences=pd.Series(sentences)
        x_prepared = self._prepare_sentences(sentences)
        predicted_scores = self._classifier.predict_proba(x_prepared)
        convert = lambda x: x

        result = collections.deque()
        for scores in predicted_scores:
            score_tups = list(zip(self._labels_sorted, scores))
            predicted_sentiment = max(score_tups, key=lambda x: x[1])[0]
            sentiment_scores = dict(score_tups)
            result.append(convert(DIDPred(predicted_sentiment, sentiment_scores)))

        return list(result)

  def save_model(self,path):
      """Save  the model on a given data set.
        Args:
            Path (:obj:`str`): Path where you want to save the model.
               
           
        """
      joblib.dump(self._classifier, path+'/svm_model.sav')
  def save_feature_vec(self,path):
    """Save  the feature vector on a given data set.
      Args:
          Path (:obj:`str`): Path where you want to save the feature vector .
              
          
      """
    
    # torch.save(obj=self._feat_union,
    #     f=path+'/feat_vec.pkl',
    #     pickle_module=dill
    # )
    dill.dump(self._feat_union, open( path+"/feat_vec.p", "wb" ))
    # pickle.dump(self._feat_union.get_params(), open( path+"/feat_vec.p", "wb" ) )
    # joblib.dump(pipeline, 'filename.pkl', compress = 1)

    # joblib.dump(self._feat_union.get_params(), path+'/feat_vec.pkl', compress = 1)




In [6]:
class UntrainedModelError(SentimentIdentificationML):
    """Error thrown when attempting to use an untrained sentiment classifier
    instance.
    """

    def __init__(self, msg):
        DialectIdError.__init__(self, msg)

In [54]:
class DIDPred(collections.namedtuple('DIDPred', ['top', 'scores'])):
    """A named tuple containing sentiment ID prediction results.
    Attributes:
        top (:obj:`str`): The sentiment label with the highest score. See
            :ref:`sentimentid_labels` for a list of output labels.
        scores (:obj:`dict`): A dictionary mapping each sentiment label to it's
            computed score.
    """

In [130]:
MODEL_PATH='/content/drive/MyDrive/Omdena_sentiment/Saved_models/Production/svm_model.sav'
FEATURE_VEC_PATH='/content/drive/MyDrive/Omdena_sentiment/Saved_models/Production/feat_vec.p'

In [141]:
class SentimentIdentificationMLInfereing(object):
  """A class for running the sentiment classification based on pretrained ML model
     
    Args:
        labels (:obj:`set` of :obj:`str`, optional): The set of dialect labels
            used in the training data in the main model.
            If None, the default labels are used.
            Defaults to None.

        training_model_path (:obj:`str`, optional): Path of training model to be used for inference,
        If none, use defult model for this libaray
        feature_vector_path (:obj:`str`, optional): Path of feature vector to be used for preparing data,
        If none, use defult model for this libaray
       
    """
  def __init__(self, labels=None,training_model_path=None,feature_vector_path=None,
                 ):
        if labels is None:
            labels = _DEFAULT_LABELS
        self._labels_sorted = sorted(labels)
        if feature_vector_path is None:
          self._feat_union=dill.load(open(FEATURE_VEC_PATH, 'rb'))
          # self._feat_union_params=  joblib.load(FEATURE_VEC_PATH)
          # self._feat_union=FeatureUnion()
          # self._feat_union.set_params(self._feat_union_params)
        else:
          self._feat_union=  dill.load(open(feature_vector_path, 'rb'))
        if training_model_path is None:
          self._classifier=  joblib.load(MODEL_PATH)
        else:
          self._classifier=  joblib.load(training_model_path)

  def _apply_preprocessing(self,text):

      # ref: https://github.com/bakrianoo/aravec
      search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى",
                "\\",'\n', '\t','&quot;','?','؟','!']
      replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا",
                "","","","ي","",' ', ' ',' ',' ? ',' ؟ ', ' ! ']
      
      tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
      text = re.sub(tashkeel,"", text)
      
      longation = re.compile(r'(.)\1+')
      subst = r"\1\1"
      text = re.sub(longation, subst, text)
      
      text = re.sub(r"[^\w\s]", '', text)
      text = re.sub(r"[a-zA-Z]", '', text)
      text = re.sub(r"\d+", ' ', text)
      text = re.sub(r"\n+", ' ', text)
      text = re.sub(r"\t+", ' ', text)
      text = re.sub(r"\r+", ' ', text)
      text = re.sub(r"\s+", ' ', text)
      text = text.replace('وو', 'و')
      text = text.replace('يي', 'ي')
      text = text.replace('اا', 'ا')
      
      for i in range(0, len(search)):
          text = text.replace(search[i], replace[i])
      
      text = text.strip()


      
      return text


  def _prepare_sentences(self, sentences):

        
      cleaned_sent=  sentences.apply(self._apply_preprocessing)             
      sent_array = np.array(cleaned_sent)       
      x_trans = self._feat_union.transform(sent_array)        
      return x_trans


  def eval(self,  X_eval,
              y_eval, data_set='DEV'):
        """Evaluate the trained model on a given data set.
        Args:
            X_eval (:obj:`np array or pandas series`, optional): loaded data for evaluation.

            y_eval (:obj:`np array or pandas series`, optional): loaded labels for evaluation.

            data_set (:obj:`str`, optional): Name of the provided data set to
                use. This is ignored if data_path is not None. Can be either
                'VALIDATION' or 'TEST'. Defaults to 'VALIDATION'.
        Returns:
            :obj:`dict`: A dictionary mapping an evaluation metric to its
            computed value. The metrics used are accuracy, f1_micro, f1_macro,
            recall_micro, recall_macro, precision_micro and precision_macro.
        """

        

        # # prepare data
        # eval_data = self._prepare_sentences(X_eval)
        

        # Generate predictions
        did_pred = self.predict(X_eval)
        did_pred = [d.top for d in did_pred]
        
       

        # Get scores
        scores = {
            'Sentiment': {
                'accuracy': accuracy_score(y_eval, did_pred),
                'f1_macro': f1_score(y_eval, did_pred,
                                     average='macro'),
                'recall_macro': recall_score(y_eval, did_pred,
                                             average='macro'),
                'precision_macro': precision_score(y_eval,
                                                   did_pred,
                                                   average='macro')
            }
        }

        return scores    

  def predict(self, sentences, output='label'):
        """Predict the sentiment  probability scores for a given list of
        sentences.
        Args:
            sentences (:obj:`list` of :obj:`str`): The list of sentences.
            output (:obj:`str`): The output label type. Possible values are
                'postive', 'neagtive', 'neutral'.
        Returns:
            :obj:`list` of :obj:`DIDPred`: A list of prediction results,
            each corresponding to its respective sentence.
        """
        if isinstance(sentences, str):
          sentences=pd.Series(sentences)
        x_prepared = self._prepare_sentences(sentences)
        predicted_scores = self._classifier.predict_proba(x_prepared)
        convert = lambda x: x

        result = collections.deque()
        for scores in predicted_scores:
            score_tups = list(zip(self._labels_sorted, scores))
            predicted_sentiment = max(score_tups, key=lambda x: x[1])[0]
            sentiment_scores = dict(score_tups)
            result.append(convert(DIDPred(predicted_sentiment, sentiment_scores)))

        return list(result)

# Testing Production Code

In [9]:
df=pd.read_csv('/content/drive/MyDrive/Omdena_sentiment/Dataset/final_text.csv')

In [11]:
df.head(5)

Unnamed: 0,final,label,length
0,رجل يرفع شعار الحريه يدعو لرفع الظلم المراه او...,2,31
1,صفاء الهاشم سيده كويتيه المراه الوحيده حاليا م...,2,27
2,حقوق المراه عينك مو دكتور واحدحثاله بلدنا بلد ...,2,40
3,شخصيا حقوق المراه لانو بالجد الدستور السوداني ...,2,42
4,حق حقوق المراه مثل الرجل يريد وحده زينه واخلاق...,2,21


In [10]:
msk = np.random.rand(len(df)) < 0.7
train = df[msk]
test = df[~msk]

In [11]:
msk = np.random.rand(len(train)) < 0.8
train_new = train[msk]
valid = train[~msk]


In [12]:
train_new.head()

Unnamed: 0,final,label,length
0,رجل يرفع شعار الحريه يدعو لرفع الظلم المراه او...,2,31
2,حقوق المراه عينك مو دكتور واحدحثاله بلدنا بلد ...,2,40
7,خيركم خيركم لاهله تكثر مخاطبه الرجال حقوق المر...,2,27
8,اليوم دخلت نقاش حلو حقوق المراه و دكتور مطوع ض...,2,22
9,صباح النيات المول انعاشها وتحقيقها يدي سمو سيد...,2,36


In [13]:
len(train_new)

128770

In [15]:
valid.head()

Unnamed: 0,final,label,length
10,تطور كبير عم حسه شان حقوق واجبات المراه الامار...,2,17
12,التعدد مطلبنا اشوف تعدد الزوجات يتسبب سعاده ال...,2,21
15,دراسات علميه اوضحت دراسات علميه بان تعدد الزوج...,2,39
18,اجمل سعوديه ولامغربيه السعوديه المغربيه تكفي ل...,2,16
27,تعدد الزوجات فوائد والمستفيد الاول زوجته الاول...,2,9


In [16]:
test.head()

Unnamed: 0,final,label,length
1,صفاء الهاشم سيده كويتيه المراه الوحيده حاليا م...,2,27
2,حقوق المراه عينك مو دكتور واحدحثاله بلدنا بلد ...,2,40
5,قرارات عدليه هامه عززت حقوق المراه,2,6
13,التعدد مطلبنا تعدد الزوجات حق مشروع حقوق الزوج...,2,23
14,تعدد الزوجات نعمه مغبون كثير الناس عددوا واعدل...,2,11


In [14]:
labels_numeric=[0,1,2]

In [15]:
train_new.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [132]:
test.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [122]:
sentimenclassifier=SentimentIdentificationML(labels_numeric)

In [123]:
sentimenclassifier.train(train_new['final'],train_new['label'],use_defult_fearures=False)

In [19]:
valid.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
valid.head()

Unnamed: 0,final,label,length
10,تطور كبير عم حسه شان حقوق واجبات المراه الامار...,2,17
16,رايكم تعدد الزوجات والله شي حلو يوم تذوق شي,2,9
25,تعدد الزوجات باب عظيم ابواب السعاده والخير وال...,2,19
40,خلاص الحمداله فرج الله عهد ملكنا سلمان عهده تو...,2,23
53,بارك الله امثالها وشافاها الله والدها الفتاه ا...,2,17


In [127]:
sentimenclassifier.eval(valid['final'],valid['label'])

{'Sentiment': {'accuracy': 0.7109357946926939,
  'f1_macro': 0.6726495586133651,
  'precision_macro': 0.7134542054825949,
  'recall_macro': 0.6676578272000081}}

In [90]:
test.iloc[0]['final']

'صفاء الهاشم سيده كويتيه المراه الوحيده حاليا مجلس الامه الكويتي مدافعه شرسه حقوق المراه وحق المواطن الكويتي وتمتلك عقليه اقتصاديه مميزه اختيرت ضمن سيده عربيه مؤثره مجتمعها'

In [126]:
sentimenclassifier.predict('صفاء الهاشم سيده كويتيه المراه الوحيده حاليا مجلس الامه الكويتي مدافعه شرسه حقوق المراه وحق المواطن الكويتي وتمتلك عقليه اقتصاديه مميزه اختيرت ضمن سيده عربيه مؤثره مجتمعها')

[DIDPred(top=2, scores={0: 0.3142449591766248, 1: 0.2719087116949098, 2: 0.4138463291284653})]

In [124]:
sentimenclassifier.save_feature_vec('/content/drive/MyDrive/Omdena_sentiment/Saved_models/Production')

In [125]:
sentimenclassifier.save_model('/content/drive/MyDrive/Omdena_sentiment/Saved_models/Production')

In [142]:
InfereSentiment=SentimentIdentificationMLInfereing(labels_numeric)

In [143]:
InfereSentiment.eval(test['final'],test['label'])

{'Sentiment': {'accuracy': 0.7154780847678287,
  'f1_macro': 0.6785456751911916,
  'precision_macro': 0.7187791701771257,
  'recall_macro': 0.6727025645388777}}