<a href="https://colab.research.google.com/github/Hamza-t/Language-Identification/blob/main/Data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hello!
In this notebook, we clean the text in our data file.
Steps:
1.   Check for duplicate rows and fix NaN values in the label column.
2.   Change the data types (the text column must be string and the label column must be integer).
3.   Clean the text data of: URLs, emojis, punctuation (?,:!..), symbols, newlines and tabs. Example: To know more about this website: https://Hamza.example.com
4.   Remove accented characters: é, à, ...
5.   Reduce repeated characters: eyyyyyy (means "yes") ==> ey
6.   Remove extra whitespace ("How are you doing ?") and convert the case with str.lower().


In [None]:
# Mount Google Drive at /content/drive so the project files can be read and written.
# The google.colab package is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#verify the path
folder_path = "/content/drive/MyDrive/My_work/NLP-Project-INSAT"
!ls "/content/drive/MyDrive/My_work/NLP-Project-INSAT"

'Data augmentation'	  'data collection '	     'General Data'
 Data_Augmentation.ipynb   Data_Preparation.ipynb     prepared_data.csv
 Data_cleaning.ipynb	   Data_visualization.ipynb


In [None]:
# Full path of the prepared dataset inside the project folder
data_path = f"{folder_path}/prepared_data.csv"

In [None]:
# Load the prepared dataset and keep only the two columns used in this notebook
import pandas as pd
data = pd.read_csv(data_path)[["text", "label"]]
data.head()

Unnamed: 0,text,label
0,ana fil ghorba w sout el athan ahla haja 3ijbi...,3.0
1,man eheb rasoule mohamed,3.0
2,rabi yerhmou sofiane cha3ri,3.0
3,rabi yarahmak ya sbou3i wou yarham kol om mita,3.0
4,merci beaucoup salime,1.0


In [None]:
# Distinct label values; a nan entry in the result means some rows have no label
data.label.unique()

array([ 3.,  1.,  4.,  2., nan,  0.])

In [None]:
# Count the missing values in each column
print(data['text'].isna().sum())  # 0 ==> the text column is complete

int(data['label'].isna().sum())  # non-zero ==> some labels are missing

0


2

In [None]:
# Show the rows whose label is not one of the five expected classes (0-4).
# Rows with a NaN label are selected too, because NaN is never "in" the list.
# `numpy.NaN` was removed in NumPy 2.0; `numpy.nan` exists in every version, so
# import it under the old name to keep `NaN` available.
from numpy import nan as NaN
data[~data['label'].isin([0., 1., 2., 3., 4.])]

Unnamed: 0,text,label
666,oui bravo kol chay mfabrek wadhe7,
5676,wlh m3lm,


In [None]:
# Both unlabeled rows are Tunizi text, so give them the Tunizi label (3)
for row_label in (666, 5676):
    data.at[row_label, 'label'] = 3.

In [None]:
# Check the change: the number of missing labels should now be 0
int(data['label'].isna().sum())

0

In [None]:
# Inspect the dtype pandas assigned to each column before converting the labels
data.dtypes

text      object
label    float64
dtype: object

In [None]:
# Convert the label column from float to integer (the text column is left as it is)
data['label'] = data['label'].astype(int)

In [None]:
display(data.dtypes)
# pandas reports the text column as dtype `object`, but each value in it is
# already a Python str, so no conversion is needed.
type(data.loc[0, 'text'])

text     object
label     int64
dtype: object

str

In [None]:
# Look for fully duplicated rows (same text and same label).
# duplicated() flags every repeat after the first occurrence.
dup_data = data.loc[data.duplicated()]
print("number of duplicate rows: ", dup_data.shape)
print(dup_data)

number of duplicate rows:  (446, 2)
          text  label
166       waaw      3
168      bravo      1
238     bravoo      1
240      bravo      1
259     bravoo      1
...        ...    ...
8217        hh      3
8219  good job      2
8224     bravo      1
8250     bravo      1
8255  sa7a bro      4

[446 rows x 2 columns]


* There are 446 duplicate rows: they represent around 5% of all the data, so let's keep them.

### Text cleaning

In [None]:
#import library
import re
import string
import unicodedata
from itertools import groupby

In [None]:
#remove emails
def remove_emails(x):
    """Delete every e-mail address found in ``x``.

    An address is matched as ``local@domain.tld`` where each part is made of
    letters, digits and ``+ . _ -`` (the last part excludes ``.``). Matching
    is case-insensitive so the function also works on text that has not been
    lower-cased yet.

    Args:
        x: Text to clean.

    Returns:
        ``x`` with each address replaced by an empty string. Surrounding
        spaces are kept, so a removed address can leave a double space.
    """
    return re.sub(
        r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',
        "",
        x,
        flags=re.IGNORECASE,
    )

#remove urls
def remove_urls(x):
    """Strip URLs that start with http://, https://, ftp:// or ssh:// from ``x``.

    The scheme is matched case-sensitively and a scheme is required, so bare
    ``www.`` links are left untouched. Text around the URL is not modified.
    """
    url_regex = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    without_urls = re.sub(url_regex, '', x)
    return without_urls

#remove rt
def remove_rt(x):
    """Drop the standalone lower-case token ``rt`` and trim both ends of the text.

    Only whole-word matches are removed; inner spacing is left as it is, so a
    removed token in the middle of the text leaves a double space behind.
    """
    without_rt = re.sub(r'\brt\b', '', x)
    return without_rt.strip()

In [None]:
#remove punctuation
def remove_special_chars(x):
    """Remove punctuation, symbols and emojis, then normalize the spacing.

    Only word characters (``\\w``: letters, digits, underscore) and spaces are
    kept.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Turn newlines, tabs and other whitespace into plain spaces first.
    # Otherwise the next substitution deletes them and glues the neighbouring
    # words together ("a\nb" would become "ab").
    x = re.sub(r'\s+', ' ', x)
    x = re.sub(r'[^\w ]+', "", x)
    # Removing symbols can leave runs of spaces; collapse and trim them.
    return ' '.join(x.split())

#remove accented chars
def remove_accented_chars(x):
    """Replace accented letters by their unaccented ASCII base letter.

    The text is decomposed with NFKD, so an accented letter becomes a base
    letter followed by combining marks. Everything that cannot be encoded as
    ASCII is then dropped. This means any other non-ASCII character (for
    example non-Latin script or emojis) is removed as well.
    """
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

#remove extra space
def remove_space(x):
    """Collapse each run of consecutive spaces into a single space.

    Only the space character is affected; tabs and newlines are left alone,
    and leading/trailing spaces are reduced to one space rather than removed.
    """
    collapsed = re.sub(' +', ' ', x)
    return collapsed

In [None]:
#lower text
def lower_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

#Reduce repeated characters
def reshape_words(text):
    """Collapse every run of identical consecutive characters to one character.

    Example: ``"eyyyyyy"`` becomes ``"ey"``. The rule applies to every run, so
    legitimate double letters are shortened too (``"good"`` becomes ``"god"``).

    Args:
        text: Text to process; it is split on whitespace.

    Returns:
        The processed words joined by single spaces.
    """
    # groupby yields one (char, run) pair per run of equal characters, so
    # keeping only the key keeps one character per run. Each word is handled
    # in a single pass, with no list.index() lookup per word.
    return " ".join(
        "".join(char for char, _run in groupby(word))
        for word in text.split()
    )

In [None]:
#Functions applied below, in order:
# lower_text, remove_urls, remove_rt, remove_emails, remove_special_chars, remove_space
# (remove_accented_chars and reshape_words are defined above but are not applied)

In [None]:
# Run the cleaning steps in order. Lower-casing comes first because the
# e-mail and "rt" patterns only match lower-case text.
data["text"] = (
    data["text"]
    .map(lower_text)
    .map(remove_urls)
    .map(remove_rt)
    .map(remove_emails)
    .map(remove_special_chars)
    .map(remove_space)
)

In [None]:
# Preview the first rows to spot-check the cleaned text
data.head()

Unnamed: 0,text,label
0,ana fil ghorba w sout el athan ahla haja 3ijbi...,3
1,man eheb rasoule mohamed,3
2,rabi yerhmou sofiane cha3ri,3
3,rabi yarahmak ya sbou3i wou yarham kol om mita,3
4,merci beaucoup salime,1


In [None]:
#save data to .csv file
from pathlib import Path
# Build the output path from folder_path so the cleaned file is always written
# to the same project folder the input was read from.
filepath = Path(folder_path) / 'cleaned_data.csv'
# to_csv writes the index by default, so it is stored as the first, unnamed column.
data.to_csv(filepath)