In [None]:
!pip install demoji

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import os
import demoji
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer

In [None]:
# Read the two CSV files from the working directory: `train.csv` into
# `df_train` and `test.csv` into `df_test`.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")


In [None]:
# Fetch the NLTK resources used by the cleaning helpers: the stop-word lists
# and the Punkt tokenizer data behind `word_tokenize`.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps the notebook runnable on either (an unknown package id
# only prints a message on older releases).
nltk.download('punkt_tab')
stop_words = stopwords.words('arabic')

# string.punctuation only contains ASCII punctuation, so the Arabic comma,
# semicolon and question mark are added explicitly; otherwise they stay glued
# to words when the text reaches the stemmer.
ARABIC_PUNCTUATION = "\u060C\u061B\u061F"
# Translation table that deletes every listed punctuation character.
translator = str.maketrans('', '', string.punctuation + ARABIC_PUNCTUATION)


In [None]:
def removeStopWords(text):
    """Return `text` with every token that appears in `stop_words` dropped.

    The text is split with `word_tokenize`; the surviving tokens are joined
    back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
def NormalizeArabic(text):
    """Map Arabic letter variants onto a single canonical letter each.

    Applies a fixed sequence of regex substitutions and returns the result.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        ("[إأٱآا]", "ا"),  # alef variants -> bare alef
        ("ى", "ي"),        # alef maqsura -> ya
        ("ؤ", "ء"),        # waw with hamza -> hamza
        ("ئ", "ء"),        # ya with hamza -> hamza
        ("ة", "ه"),        # ta marbuta -> ha
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
# Characters stripped by `arabic_diacritics`, written as explicit escapes so
# the (otherwise invisible) combining marks cannot be mangled by an editor.
# Compiled once at definition time instead of on every call.
_ARABIC_DIACRITICS_RE = re.compile(
    "["
    "\u0651"  # Tashdid
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
    "]"
)


def arabic_diacritics(text):
    """Return `text` with Arabic diacritics and the Tatwil/Kashida removed.

    Every other character, including whitespace, is left untouched.
    """
    return _ARABIC_DIACRITICS_RE.sub('', text)

def removeNumbers(text):
    """Return `text` without any character for which `str.isdigit()` is true."""
    non_digit_chars = []
    for ch in text:
        if ch.isdigit():
            continue
        non_digit_chars.append(ch)
    return ''.join(non_digit_chars)

def stemming(text):
    """Tokenise `text` and return the ISRI stem of each token, space-joined."""
    stemmer = ISRIStemmer()
    return " ".join(stemmer.stem(token) for token in word_tokenize(text))

def remove_english_characters(text):
    """Delete every run of ASCII letters (a-z, A-Z) from `text`."""
    latin_run = re.compile(r'[a-zA-Z]+')
    return latin_run.sub('', text)

In [None]:
# Clean the training comments in place. Each step is applied to the whole
# column with Series.map instead of building a one-row DataFrame and calling
# DataFrame.update for every row; the steps and their order are unchanged.
train_cleaning_steps = (
    removeStopWords,
    NormalizeArabic,
    arabic_diacritics,
    removeNumbers,
    lambda text: text.translate(translator),   # strip punctuation
    stemming,
    lambda text: demoji.replace(text, ""),     # remove emoji
)
train_comments = df_train['comment']
for step in train_cleaning_steps:
    train_comments = train_comments.map(step)
df_train['comment'] = train_comments


In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
# Split comments, labels and row indices together: 20% of the rows go to the
# test side, with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    df_train['comment'],
    df_train['label'],
    df_train.index,
    test_size=0.20,
    random_state=20,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams with sublinear term frequency and L2 row
# normalisation. `encoding` is only consulted when the vectorizer is handed
# bytes or files; 'latin-1' would garble Arabic in that case, so use UTF-8.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    norm='l2',
    ngram_range=(1, 2),
)
# Fit the vocabulary on every cleaned training comment and keep the result as
# a dense array, which is what the model cell is fitted on.
features = tfidf.fit_transform(df_train['comment']).toarray()
labels = df_train['label']

In [None]:
from sklearn.svm import SVC  
# Sigmoid-kernel support vector classifier with fixed hyper-parameters,
# fitted on the TF-IDF features; the result of `fit` is kept as `clf` and is
# what the submission cell predicts with.
model = SVC(kernel='sigmoid', C=10, gamma=1)
clf = model.fit(features, labels)

In [None]:
data = df_test['comment']
df_unseen = pd.DataFrame(data=data)
df_unseen_1 = pd.DataFrame(data=data)

# Clean the unseen comments with the same steps, in the same order, as the
# training comments, including the emoji removal that this cell previously
# skipped, so both sets reach the vectorizer in the same form.
# Each step runs over the whole column via Series.map rather than a per-row
# DataFrame.update.
unseen_cleaning_steps = (
    removeStopWords,
    NormalizeArabic,
    arabic_diacritics,
    removeNumbers,
    lambda text: text.translate(translator),   # strip punctuation
    stemming,
    lambda text: demoji.replace(text, ""),     # remove emoji
)
unseen_comments = df_unseen['comment']
for step in unseen_cleaning_steps:
    unseen_comments = unseen_comments.map(step)
# Assigning a new column (instead of updating values in place) leaves
# `df_test` and `df_unseen_1` holding the raw, uncleaned text.
df_unseen['comment'] = unseen_comments

# Reuse the vocabulary fitted on the training data; transform only, no refit.
txt = tfidf.transform(df_unseen['comment']).toarray()
   

In [None]:
# Predict a label for every cleaned test comment and write the submission file.
csv_columns = ['id', 'label']
csv_file = "Sample_Submission.csv"
y_pred = clf.predict(txt)

# Ids are 1-based and sized from the predictions themselves; a hard-coded
# range would misalign ids and labels (NaN rows) whenever the test set does
# not have exactly 240 rows.
ids = pd.DataFrame(data=list(range(1, len(y_pred) + 1)))
pred = pd.DataFrame(data=y_pred)
data = pd.concat([ids, pred], axis=1)
data.columns = csv_columns
print(data)
# The frame already carries the column names, so they become the CSV header.
data.to_csv(csv_file, sep=',', index=False)