<a href="https://colab.research.google.com/github/MohamedHesham02/Sentiment-Analysis-Arabic-Tweets/blob/main/sentiment_analysis_data_preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
import sklearn 
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv('/content/drive/MyDrive/final.csv')  
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,dialect,tweet
0,0,0,0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT ููู ุจุงูููุงูุฉ .. ููุชูุถ .. ูุบูุฑ .
1,1,1,1,1175416117793349632,IQ,@7zNqXP0yrODdRjK ูุนูู ูุฐุง ูุญุณูุจ ุนูู ุงูุจุดุฑ .. ุญ...
2,2,2,2,1175450108898565888,IQ,@KanaanRema ูุจูู ูู ููุงูู ุฎููุฌู
3,3,3,3,1175471073770573824,IQ,@HAIDER76128900 ูุณูููู ูุฑูุฑู ูุฑูุญู ุงูุญููู๐
4,4,4,4,1175496913145217024,IQ,@hmo2406 ููู ูู ุงูุบูุจู ุงุฎ ูุญูุฏ ๐ธ๐บ


In [4]:
del data['Unnamed: 0']
del data['Unnamed: 0.1']
del data['Unnamed: 0.1.1']

In [5]:
data = data.take(np.random.permutation(len(data))[:])

In [6]:
data.shape

(20833, 3)

In [7]:
data.isna().sum()

id         0
dialect    0
tweet      0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20833 entries, 10531 to 2993
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       20833 non-null  int64 
 1   dialect  20833 non-null  object
 2   tweet    20833 non-null  object
dtypes: int64(1), object(2)
memory usage: 651.0+ KB


# Text Pre-processing of Arabic letters

In [9]:
# Arabic letters encodings

COMMA = u'\u060C'
SEMICOLON = u'\u061B'
QUESTION = u'\u061F'
HAMZA = u'\u0621'
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA = u'\u0626'
ALEF = u'\u0627'
TEH_MARBUTA = u'\u0629'
TATWEEL = u'\u0640'
LAM = u'\u0644'
HEH = u'\u0647'
YEH = u'\u064a'
ALEF_MAKSURA = u'\u0649'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'
PERCENT = u'\u066a'
DECIMAL = u'\u066b'
THOUSANDS = u'\u066c'
STAR = u'\u066d'
FULL_STOP = u'\u06d4'
BYTE_ORDER_MARK = u'\ufeff'
MULITIPLICATION_SIGN = u'\u00D7'
DIVISION_SIGN = u'\u00F7'


# Tashkeel
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'

#Ligatures
LAM_ALEF = u'\ufefb'
LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
LAM_ALEF_HAMZA_BELOW = u'\ufef9'
LAM_ALEF_MADDA_ABOVE = u'\ufef5'

HARAKAT_PAT = re.compile(u"["+u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA])+u"]")
HAMZAT_PAT = re.compile(u"["+u"".join([WAW_HAMZA, YEH_HAMZA])+u"]")
ALEFAT_PAT = re.compile(u"["+u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW])+u"]")
LAMALEFAT_PAT = re.compile(u"["+u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE])+u"]")

WESTERN_ARABIC_NUMERALS = ['0','1','2','3','4','5','6','7','8','9']
EASTERN_ARABIC_NUMERALS = [u'\u06F0', u'\u06F1', u'\u06F2', u'\u06F3', u'\u0664', u'\u06F5', u'\u0666', u'\u06F7', u'\u06F8', u'\u06F9']

eastern_to_western_numerals = {}
for i in range(len(EASTERN_ARABIC_NUMERALS)):
    eastern_to_western_numerals[EASTERN_ARABIC_NUMERALS[i]] = WESTERN_ARABIC_NUMERALS[i]

arabic_punctuations = COMMA + SEMICOLON + QUESTION + PERCENT + DECIMAL + THOUSANDS + STAR + FULL_STOP + MULITIPLICATION_SIGN + DIVISION_SIGN
all_punctuations = string.punctuation + arabic_punctuations + '()[]{}' # English & Arabic punctuations

all_punctuations = ''.join(list(set(all_punctuations)))

# Text Cleaning 
### Removal of unrequired terms of sentences that could have a negative effect on the training process (e.g: tags, hashtags, urls, email addresses, .....)

In [10]:
def strip_tashkeel(text):
    text = HARAKAT_PAT.sub('', text)
    return text 

def strip_tatweel(text):
    return re.sub(u'[%s]' % TATWEEL, '', text)

def remove_non_arabic(text):
    return ' '.join(re.sub(u"[^\u0621-\u063A\u0640-\u0652 ]", " ", text,  flags=re.UNICODE).split())

def keep_arabic_english_n_symbols(text):
    return ' '.join(re.sub(u"[^\u0621-\u063A\u0640-\u064aa-zA-Z#@_:/ ]", "", text,  flags=re.UNICODE).split())

def normalize_hamza(text):
    text = ALEFAT_PAT.sub(ALEF, text)
    text = HAMZAT_PAT.sub(HAMZA, text)
    return text

def normalize_lamalef(text): 
    text = LAMALEFAT_PAT.sub((LAM, ALEF), text)
    return text

def normalize_spellerrors(text):
    text = re.sub(u'[%s]' % TEH_MARBUTA, HEH, text)
    return re.sub(u'[%s]' % ALEF_MAKSURA, YEH, text)

def remove_underscore(text):
    return ' '.join(text.split('_'))

def remove_retweet_tag(text):
    tag = re.compile('rt @[a-zA-Z0-9_]+:|@[a-zA-Z0-9_]+')
    text = tag.sub('', text)
    hashtag = re.compile('\#')
    text = hashtag.sub('', text)
    return text

def replace_emails(text):
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    for email in emails:
        text = text.replace(email,'#')
    return text

def replace_urls(text):
    return re.sub(r"http\S+|www.\S+", "#", text)

def convert_eastern_to_western_numerals(text):
    for num in EASTERN_ARABIC_NUMERALS:
        text = text.replace(num, eastern_to_western_numerals[num])
    return text

def remove_all_punctuations(text):
    for punctuation in all_punctuations:
        text = text.replace(punctuation, ' ')
    return text

def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags 
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    return text

def replace_phone_numbers(text):
    return re.sub(r'\d{10}', '#', text)

def remove_extra_spaces(text):
    return ' '.join(text.split())

def normalize_tweet(text):
    new_text = text.lower()
    new_text = normalize_hamza(new_text)
    new_text = strip_tashkeel(new_text)
    new_text = strip_tatweel(new_text)
    new_text = normalize_spellerrors(new_text)
    new_text = remove_retweet_tag(new_text)
    new_text = replace_emails(new_text)
    new_text = remove_underscore(new_text)
    new_text = replace_phone_numbers(new_text)
    new_text = remove_all_punctuations(new_text)
    new_text = replace_urls(new_text)
    new_text = convert_eastern_to_western_numerals(new_text)
    new_text = keep_arabic_english_n_symbols(new_text)
    new_text = remove_non_arabic(new_text)
    new_text = remove_extra_spaces(new_text)
    
    return new_text

In [11]:
test= " ููุฏ ูุงู ุฃูู ูุฃูู ูุฃูู ูุคูู"

def normalize_hamza(text):
    text = ALEFAT_PAT.sub(ALEF,text)
    text = HAMZAT_PAT.sub(HAMZA,text)
    return text

result = normalize_hamza(test)
print(result)

 ููุฏ ูุงู ุงูู ูุงูู ูุงูู ูุกูู


In [12]:
data['tweet'] = data['tweet'].apply(normalize_tweet)
data.head()

Unnamed: 0,id,dialect,tweet
10531,1132446517996998784,SA,ุงููู ูุตูุญู ูููุช ุจุงูุงุซุงุฑู ุฒู ุฎูููุง ูุงููุชุจ ุงุซุฑุช ...
12916,1015690276344619008,DZ,ุงุณุญุจ ููุงุงุงุงูู ุงููู ุนููู ูุง ุงุจู ุดุนุนุนุฑ ุทูููููู
18942,1098503953204105216,AE,ุงูุง ูู ุงููุงุณ ุงููู ูุดุฑู ุงููุนุจู
20498,1027083124545646592,BH,ูุงุฒู ูุตูุฑ ุดู ุญูู ุงูููู ูุงู ุงูุชุงุฑูุฎ ูุง ูุชุทูู
10175,950283002726240384,SA,ุงููู ูุนูู ุงููู ูู ุงูููู ููุจู ุชุนูู


In [13]:
data['tweet'][5]

'ูุงุงุฎู ุงูุงุฑูุงุจู ุงุฐุง ูุงู ุนุฑุงูู ุณุนูุฏู ููุณุทููู ููู ุงููุดููู ุนูู ุจุงุจ ุงููุฑุถ ุฎููุฌู ููุงุนูุฏู ุฑุญูู ููู ุงููุณุงุณ ุจูุฏุณูุชู ุงููู ูุฑุถู ุนูู ุฎูุตุช ูุดุงูู ุงูุนุฑุจ ูุงุฎุชูููุง ุจุฌูุณูุช ุงุจู ุงูุทูู ุงููุบุฑุฏูู ุณุงููุง ูู ููู ูุฑุฌุญุช ููู ุงููุงุฑุซู ุงููู ูุตูุชู'

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
# convert tweets col to one string
tweets = data.tweet.str.cat(sep=' ')
tweets

'ุงููู ูุตูุญู ูููุช ุจุงูุงุซุงุฑู ุฒู ุฎูููุง ูุงููุชุจ ุงุซุฑุช ุนููู ุงุณุญุจ ููุงุงุงุงูู ุงููู ุนููู ูุง ุงุจู ุดุนุนุนุฑ ุทูููููู ุงูุง ูู ุงููุงุณ ุงููู ูุดุฑู ุงููุนุจู ูุงุฒู ูุตูุฑ ุดู ุญูู ุงูููู ูุงู ุงูุชุงุฑูุฎ ูุง ูุชุทูู ุงููู ูุนูู ุงููู ูู ุงูููู ููุจู ุชุนูู ุนููู ุชุงุบ ูุตุญุจุชููุจูุงูู ููุจุด ูููู ุชุดุธู ุชูุชุจ ุจุงูุถุงุฏ ุงุจู ุนุตุง ููุฐุง ุชุดุธู ูุง ููุชุจููุงุด ุงููุงุณ ุตูุนู ูุซู ุฑุงุณู ููุฐุง ุชุดุถู ูุซู ูุง ุงูุช ูุชุจุช ูู ุชุบุฑูุฏุชู ูุงูุง ุงุญุณู ุงูุชุจูุง ุจุงููุตุฑู ุชุดุฒู ูุง ุณุนุงุฏู ุงููุฒูุฑ ุฑุงุฌุน ุนููู ููุงูุฏ ุดููู ุถูุน ุญุชู ุงููุบู ุฌูุจ ุงูุญุงุฌุงุช ุงูุซุงููู ุงููู ุจุงูู ุจุงูู ูู ูู ุงููุตูุน ุชู ูุน ุงูุบุทุง ุงููู ูุญูู ุงูุดุงุดู ุนุณุจ ูุงูุฒูุฏ ุนูููุง ูุง ููุช ุญุจูุจุชู ุดููู ุงูุตูุฑ ุชููููู ุจุณ ูู

## Getting a frequency insight of words



In [16]:
import nltk
from nltk.tokenize import word_tokenize

#function to split text into word
tokens = word_tokenize(tweets)

vocabulary = set(tokens)
print('no. of words is:', len(vocabulary))

frequency_dist = nltk.FreqDist(tokens) # words frequency
sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)[0:20] # highest 20 word occurred only 

no. of words is: 64108


['ูู',
 'ูู',
 'ูุง',
 'ุนูู',
 'ุงููู',
 'ุงููู',
 'ู',
 'ุจุณ',
 'ูุง',
 'ูู',
 'ุงูุง',
 'ูุงููู',
 'ููุง',
 'ูุง',
 'ูุด',
 'ุดู',
 'ุงู',
 'ูุน',
 'ูุฐุง',
 'ุงูุช']

## Stopwords removal from all tweets that will not be an important paramter to classify the tweet

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
stop = stopwords.words('arabic')
arab_stopwords = set(nltk.corpus.stopwords.words("arabic"))
arab_stopwords

{'ุก',
 'ุกู',
 'ุข',
 'ุขุจ',
 'ุขุฐุงุฑ',
 'ุขุถ',
 'ุขูููู',
 'ุขูุงุก',
 'ุขููุง',
 'ุขู',
 'ุขูุง',
 'ุขูุงู',
 'ุขูู',
 'ุขูู',
 'ุขู',
 'ุฃ',
 'ุฃุจุฏุง',
 'ุฃุจุฑูู',
 'ุฃุจู',
 'ุฃุจู',
 'ุฃุฌู',
 'ุฃุฌูุน',
 'ุฃุญุฏ',
 'ุฃุฎุจุฑ',
 'ุฃุฎุฐ',
 'ุฃุฎู',
 'ุฃุฎู',
 'ุฃุฑุจุน',
 'ุฃุฑุจุนุงุก',
 'ุฃุฑุจุนุฉ',
 'ุฃุฑุจุนูุฆุฉ',
 'ุฃุฑุจุนูุงุฆุฉ',
 'ุฃุฑู',
 'ุฃุณูู',
 'ุฃุตุจุญ',
 'ุฃุตูุง',
 'ุฃุถุญู',
 'ุฃุทุนู',
 'ุฃุนุทู',
 'ุฃุนูู',
 'ุฃุบุณุทุณ',
 'ุฃู',
 'ุฃูุฑูู',
 'ุฃูุนู ุจู',
 'ุฃููู',
 'ุฃูุจู',
 'ุฃูู',
 'ุฃูุชูุจุฑ',
 'ุฃูุซุฑ',
 'ุฃู',
 'ุฃูุง',
 'ุฃูู',
 'ุฃููู',
 'ุฃู',
 'ุฃูุง',
 'ุฃูุงู',
 'ุฃูุงูู',
 'ุฃูุงููู',
 'ุฃูุฏ',
 'ุฃูุณ',
 'ุฃูุณู',
 'ุฃููุง',
 'ุฃู',
 'ุฃูุง',
 'ุฃูุจุฃ',
 'ุฃูุช',
 'ุฃูุชู',
 'ุฃูุชูุง',
 'ุฃูุชู',
 'ุฃูุชู',
 'ุฃูุดุฃ',
 'ุฃูู',
 'ุฃููู',
 'ุฃููู',
 'ุฃููุง',
 'ุฃู',
 'ุฃูุช',
 'ุฃูุดู',
 'ุฃูู',
 'ุฃููุฆู',
 'ุฃููุงุก',


In [19]:
arab_stopwords = '|'.join(arab_stopwords)
arab_stopwords

'ุฐุงู|ููุงุชุงูู|ุงูุฃูู|ุญูู|ููู|ุนูุฏ|ุฃูููู|ููููุง|ุขูู|ุฅู|ุง|ุฒุงู|ุขูุงุก|ููุง|ุฃูุนู ุจู|ุจุนุถ|ููุง|ูุงู|ุฃููุง|ููู|ูุง ุจุฑุญ|ูุฅุฐ|ูููุง|ุจู|ูุฃูู|ุฃุฑุจุน|ุจูุง|ุฅุฐูุง|ูุฏ|ุงุฑุจุนูู|ุฃู|ุชุณุนุฉ|ุฃูุชู|ุชูููุง|ููุฑู|ุณุงุก|ุฏ|ุฃูุถุง|ุนุฌุจุง|ููุง|ููุณ|ููุง|ุฃุตูุง|ุฅูููู|ููุฐุงูู|ุจูู|ูููุง|ูุฌุฏ|ุฐุง|ูุน|ุฃูุดู|ุฎุงุตุฉ|ุฃูุชูุจุฑ|ูุฅุฐุง|ูุนููู|ุฅุฐุงู|ุฐูุง|ุฐููู|ุฅุฐุง|ุฅูุง|ุจ|ุฌูุนุฉ|ุขูุง|ููุณ|ุฃูู|ููู|ุนููู|ูุง|ููุณุช|ุฌููููุฉ|ูุงุชู|ูุง|ุงูุชู|ุซ|ูุฐุง|ูุฅุฐุง|ุซูุงููุฉ|ุซูููุฆุฉ|ุงูููุงุชู|ุฃูู|ุทุฑุง|ุณุชุฉ|ูุง|ุทุงู|ุซูุงุซูู|ูุฏู|ุฃูุชูุง|ูุงุชู|ุตุฑุงุญุฉ|ููุง|ููุจ|ุทูู|ูุงุก|ููููุง|ุฐููู|ุฃุจู|ููุง|ุจูุฏ|ุตุงุฑ|ุชุณุน|ูู|ููููุจุฑ|ุณุจุนูู|ุจุนุฏ|ูุฃููู|ุฅูุงูุง|ููุฎ|ูู|ุนุงุฏ|ุนุดุฑูู|ูุงุฏุงู|ุช|ุฎูุณุฉ|ููุงุชูู|ุถ|ุชุดุฑูู|ุฅูููู|ูุงุก|ูุณุงุก|ุฐุงูู|ุ

In [20]:
data['tweet']=data['tweet'].replace(arab_stopwords, '')
data['tweet'].tail(10)

4912     ุงููุจูุงูููู ุจูุญูู ุจูุงูู ูู ุนุฑุจู ุชุทูุฑ ุงูููุถูุน ูุต...
14252    ุงูุช ูุณุฌูู ุญุถููููุฑูู ูุงุณ ูุงูุช ูุชุงุจุนุงูููุชููุชุฑ ูุบ...
19437    ูุด ูุดููุฑู ูุนูุงู ุงุฎุทู ููุนุฏู ูู ุงูุช ูุฏููู ูููุง ุณ...
12784                     ูุฏู ุงุฎุชูุชู ุชุงุน ุงูุฌูุงุฌูู ุชุงุน ุงูุตุญ
4128                                   ุงููู ูุง ุฑุจ ูุณูุน ููู
4055         ุนู ููุง ูุจุทู ููุฑู ูุนู ุงุดู ุจุงูุฏููุง ุดุนูุฑ ุงูุชูุณุญู
13921    ุงูุง ุชูุฑูุจุง ุจุฌุฏ ูุงููู ููุช ูู ูุชุฑ ุงูููุฏ ูุงูู ุฑูุญ...
9909                   ูุตููู ููุงุชุฑ ุงุฏุนูููุง ุงููู ูุฌุฒุงูู ุฎูุฑ
15006    ุจุนุฏ ููุงูู ุงูููุฑูุงุชู ุจุณุงูู ุนู ุฑุงูู ุจุตูู ุงููููุงู...
2993     ูุณููู ูุงููู ุฏุฎูุช ุนูู ุงูุฑุฏูุฏ ุนูุฏู ูุงุดูุช ุฑุฏ ุณูุนู...
Name: tweet, dtype: object

## Data Preparation 

### Tokenization 
Coverting sentences to vector of sentence words 

### Stemming is a process of converting every word to its root of meaning 
 This process will help the model gain the dialect of each country and attitude of writing on twitter for easier classification

In [21]:
# small example of stemming

from nltk.stem.isri import ISRIStemmer
st = ISRIStemmer()

token = word_tokenize(data['tweet'][15])
print(token)
string = []

for i in token:
    stemmed = st.stem(i)
    string.append(stemmed)
    fstring = ' '.join(string)
print(fstring)

['ูุณุนุฏ', 'ูุณุงู', 'ุจูุช', 'ุงูุนู']
ุณุนุฏ ุณุงู ุจูุช ูุนู


In [22]:
# Applying same process on all dataset

for a in range(len(data['tweet'])):
    token = word_tokenize(data['tweet'][a])
    print(token)
    string = []
    for i in token:
        stemmed = st.stem(i)
        string.append(stemmed)
    fstring = ' '.join(string)
    data['tweet'][a] = fstring
    print(data['tweet'][a])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['ุดููู', 'ูููุชูุง', 'ุดููู', 'ูุฎุฌูุช', 'ูููุง', 'ุงููุง', 'ุงูุง', 'ูุง', 'ุจุฏูุช', 'ุงู', 'ููุงู', 'ูุนุงูู', 'ูููุช', 'ุญูุงุฑ', 'ุฌุงุฑู', 'ุจุทุฑููู', 'ูุณุชูุฒู', 'ุฌุฏุง', 'ูุฏุฎูุช', 'ููู', 'ูุซุงููุง', 'ุงูุง', 'ูุง', 'ุดุชูุชู', 'ููุง', 'ูุฑู', 'ุจุงูุฑุบู', 'ุงูู', 'ุดุชูุชูู', 'ูููุชู', 'ููููู', 'ุงุฏุจ', 'ููู', 'ุงูุญูุฏููู', 'ุชุฑุจูุชู', 'ููุนุชูู', 'ุงุฌุงุฑูู', 'ูู', 'ุงูุงุณุงุกู', 'ูููุช', 'ุจุฑุฏ', 'ุนูู', 'ุชุบุฑูุฏุงุชู', 'ููุท']
ุดูู ููุช ุดูู ุฎุฌู ูููุง ุงูู ุงูุง ูุง ุจุฏุช ุงู ููู ูุนุง ููุช ุญูุฑ ุฌุฑู ุทุฑู ูุฒู ุฌุฏุง ุฏุฎู ููู ูุซู ุงูุง ูุง ุดุชู ููุง ูุฑู ุฑุบู ุงูู ุดุชู ููู ููู ุงุฏุจ ููู ุญูุฏูู ุฑุจุช ูุนุช ุฌุฑู ูู ุณุกู ููู ุจุฑุฏ ุนูู ุชุบุฑูุฏุงุชู ููุท
['ููููุจุฏูุง', 'ุฏู', 'ูุง', 'ูุงูุง', 'ูุด', 'ู

In [23]:
data.head(5)

Unnamed: 0,id,dialect,tweet
10531,1132446517996998784,SA,ุงูู ุตูุญ ููุช ุซุฑู ุฒู ุฎูู ูุชุจ ุงุซุฑ ุนููู
12916,1015690276344619008,DZ,ุณุญุจ ููุงุงุงุงูู ุงูู ุนูู ูุง ุงุจู ุดุนุนุนุฑ ุทูููููู
18942,1098503953204105216,AE,ุงูุง ูู ูุงุณ ุงูู ุดุฑู ูุนุจ
20498,1027083124545646592,BH,ูุฒู ูุตุฑ ุดู ุญูู ุงูููู ูุงู ุงุฑุฎ ูุง ุชุทู
10175,950283002726240384,SA,ุงูู ูุนู ุงูู ูู ููู ููุจ ุนูู


## Word Embedding is conversion into a numerical format where each tweet is represented by a matrix (word vectors)

Term Frequency โ Inverse document frequency (TF-IDF) is an embedding technique that is intended to reflect how important a word is to a document  The tfโidf value increases proportionally to the number of times a word appears in the document 



Term Frequency (TF) = (Frequency of a term in the document)/(Total number of terms in documents)
Inverse Document Frequency(IDF) = log( (total number of documents)/(number of documents with term t))

TF-IDF = (TF).(IDF)

## N-Gram Concept
n-grams is a sequence of N items from a given example of text to get the highest probablities of followed sequence of words can form a specific dialect or class 

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer                                                 # min. n & max. n
unigram_word_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word', ngram_range=(1, 1), max_features=10000)

In [25]:
unigramdata = unigram_word_vectorizer.fit_transform(data['tweet'].astype('str'))
unigramdata = unigramdata.toarray()

vocab = unigram_word_vectorizer.get_feature_names()
unigrams = pd.DataFrame(np.round(unigramdata, 1), columns=vocab)
unigrams[unigrams > 0] = 1

unigrams.head()

Unnamed: 0,ุกุจุฑ,ุกุจุฑุฑุฑ,ุกุจุฑุฑุฑุฑ,ุกุจุด,ุกุจุดู,ุกุซุฑ,ุกุฎุฑ,ุกุฏุจ,ุกุณุณ,ุกุด,...,ููุฒ,ููุณ,ููุด,ููุถ,ููุบ,ููู,ููู,ููู,ููู,ููู
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
import joblib

joblib.dump(unigram_word_vectorizer, 'vectorizer_feature.pkl')

['vectorizer_feature.pkl']

In [None]:
unigrams.to_csv('/content/drive/MyDrive/unigrams_features.csv')

## Another Word Embedding Technique
CountVectorizer is converting a collection of text documents to a matrix of word counts.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words= stopwords.words("arabic"),ngram_range=(2, 2), max_features = 10000)
X = count_vectorizer.fit_transform(data['tweet']).toarray()
df_bigrams = pd.DataFrame(X, columns=count_vectorizer.get_feature_names())
df_bigrams.shape

(20833, 10000)

In [None]:
df_bigrams.to_csv('/content/drive/MyDrive/bigrams_features.csv')

In [None]:
v = CountVectorizer(ngram_range=(2, 2))
print(v.fit(["an apple a day keeps the doctor away"]).vocabulary_)

{'an apple': 0, 'apple day': 1, 'day keeps': 2, 'keeps the': 4, 'the doctor': 5, 'doctor away': 3}


In [None]:
count_vectorizer = CountVectorizer(stop_words= stopwords.words("arabic"),ngram_range=(3, 3), max_features = 10000)
X = count_vectorizer.fit_transform(data['tweet']).toarray()
df_trigram = pd.DataFrame(X, columns=count_vectorizer.get_feature_names())
df_trigram.shape

(20833, 10000)

In [None]:
df_trigram.to_csv('/content/drive/MyDrive/trigrams_features.csv')

## Label Encoding 
converting classes into numbers 

In [None]:
from sklearn import preprocessing

pro = preprocessing.LabelEncoder()
encpro = pro.fit_transform(data['dialect'])
data['dialect'] = encpro
y = data['dialect']

In [None]:
y.to_csv('/content/drive/MyDrive/labels.csv')

In [None]:
y.shape

(20833,)

### Text Sequence 
vectorizing all tweets as sequence of features and making the highest existing words in corpus having higher value 

### Padding Sequence
framming the vector by zeros to make all sentences vectors of same length

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text,sequence

X=data['tweet'].values

max_features=10000
x_tokenizer=Tokenizer(max_features)
x_tokenizer.fit_on_texts(X)
X = x_tokenizer.texts_to_sequences(X) 
X = sequence.pad_sequences(X)

In [None]:
X.shape

(20833, 58)

In [None]:
seq_data = pd.DataFrame(X)
seq_data.to_csv('/content/drive/MyDrive/seq_data.csv')