# **Data Cleaning**

### **Import all Datasets**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
df = pd.read_csv("gemastik14.csv")
df.sample(10)

Unnamed: 0.1,Unnamed: 0,conversation_id,date,tweet,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
527,527,1.13E+018,2019-05-31 11:51:21,Mudiknya hari H dong pas lebarannya Enak mac...,,,,,,,,,,,,,
959,959,1.29355663747145E+018,2020-08-12 21:35:10,Yang jelas adalah berkurangnya ruang terbuka h...,,,,,,,,,,,,,
1424,1424,1.12578637903045E+018,2019-05-07 22:36:03,Dibikin dulu kota nya baru ditempatin orangnya...,,,,,,,,,,,,,
2108,2108,"1,24E+18",20/03/2020 19:07,Kalo ada kutu gua taroin hama aja sekalian ni ...,,,,,,,,,,,,,
2245,2245,"1,17E+18",19/09/2019 18:18,Cepat datanglah musim penghujan Biar tersiram ...,,,,,,,,,,,,,
781,781,1.28E+018,2020-07-09 7:04:41,Skema spt ini belum juga di jamah di indo kit...,,,,,,,,,,,,,
1370,1370,1.15142687056017E+018,2019-07-17 16:42:13,belum dari masalah ke per buruh an konflik la...,,,,,,,,,,,,,
337,337,1.14E+018,2019-06-18 16:27:35,Kampung nenekku Desa Lengkong tadinya bnyk saw...,,,,,,,,,,,,,
1072,1072,1.0428381703651E+018,2018-09-21 1:09:29,Hari tani di peringati lahirnya UUPA no tahu...,,,,,,,,,,,,,
1967,1967,"1,28E+18",25/06/2020 02:26,time intern bulan tu aku jumpa ramai orang ...,,,,,,,,,,,,,


In [4]:
data = df[["conversation_id","date","tweet"]]

In [5]:
data.head()

Unnamed: 0,conversation_id,date,tweet
0,1.41e+18,2021-06-24 8:41:03,Di gua udh ga ada sawah digusur sentraland se...
1,1.41e+18,2021-06-22 1:05:52,teriak harga properti mahal akhirnya sawah d...
2,1.41e+18,2021-06-18 8:41:56,Gunungnya udah dikeruk Sawah udah jadi pabrik...
3,1.41e+18,2021-06-17 20:55:28,Semua sawah akan digusur dinggo mbangun dalan ...
4,1.41e+18,2021-06-17 19:41:33,Terpikir jg klo fenomena skrg proyek bangunan ...


In [4]:
# Import all datasets
slangwords = pd.read_csv("all_slangwords.csv")
stopwords = pd.read_csv("stopwords.csv")

In [5]:
slangwords.sample(5)

Unnamed: 0,old,new
4111,ssemangatpagi,semangat pagi
12497,dinihari,dini hari
8605,astaghfirullohal,astagfirullahal
15132,mempermain,mempermainkan
15031,mbangun,membangun


In [6]:
stopwords.sample(5)

Unnamed: 0,0
545,yang
148,sebelum
61,menanti-nanti
589,bagaimanakah
688,kasus


In [7]:
stopwords.fillna("",inplace=True)
slangwords.fillna("",inplace=True)

In [8]:
list_stopwords = stopwords["0"].values.tolist()
list_stopwords[:10]

['jumlahnya',
 'jelaslah',
 'secara',
 'sejak',
 'kita',
 'benarkah',
 'benarlah',
 'segala',
 'olehnya',
 'mungkin']

In [9]:
# Create dict of slangwords
list_old = slangwords["old"]
list_new = slangwords["new"]
zip_iterator = zip(list_old, list_new)
dict_slang = dict(zip_iterator)

In [10]:
dict_slang.get("org")

'orang'

### **Text Cleaning**

#### Phase 1 : remove hashtags, @user, mentions, RT, URL links

In [11]:
import re
def phase1(text) :
    res = re.sub("@\\S+", "", text) ## Remove @user
    res = re.sub("#\\S+", "", res) ## Remove hashtags
    res = re.sub("RT", "", res) ## RT
    res = re.sub(r"http\S+", "", res) ## Remove URL
    return res

#### Phase 2 : remove escape strings

In [12]:
def phase2(text):
    res = re.sub(r"\n|\t|\f|\r", ' ', text).strip()
    return res

#### Phase 3 : remove ascii / unicode substring

In [13]:
def phase3(text) :
    string_encode = text.encode("ascii", "ignore").decode()
    return string_encode

#### Phase 4 : remove amp and extra whitespace

In [14]:
def phase4(text) :
    res = text.lower()
    res = re.sub("&amp;", "", res)
    res = re.sub(" amp "," ",res)
    res = re.sub(' +', ' ', res)
    res = res.strip()
    return res

#### Phase 5 : Change slang word

In [15]:
def phase5(text) :
    words = text.split()
    review = [dict_slang.get(word) if word in list(dict_slang.keys()) else word for word in words]
    review = ' '.join(review)
    return review

#### Phase 6 : Remove all symbols

In [16]:
def phase6(text) :
    res = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "",text)
    res = ''.join([i for i in res if not i.isdigit()])
    words = res.split()
    review = [dict_slang.get(word) if word in list(dict_slang.keys()) else word for word in words]
    review = ' '.join(review)
    return review

#### Phase 7 : Remove words without vowel. ex : wkwkwk

In [17]:
vowel = ['a','e','i','o','u']
def phase7(text) : 
    vowset = set(vowel)
    list_word =  text.split()
    temp = [word for word in list_word if vowset.intersection(word)]
    temp = [word for word in temp if word not in vowel]
    res = ' '.join(temp)
    return res

#### Phase 8 : Remove stopwords

In [18]:
def phase8(text) :
    ## Remove Stopwords
    words = text.split()
    res = [word for word in words if not word in list_stopwords]
    res = " ".join(res)
    return res

#### Apply all phase

In [19]:
def all_phase(text) :
    res = phase1(text)
    res = phase2(res)
    res = phase3(res)
    res = phase4(res)
    res = phase5(res)
    res = phase6(res)
    res = phase7(res)
    res = phase8(res)
    return res

In [20]:
coba = data.tweet[1132]
coba

'Kritik yang lebih baik itu mungkin penyusutan lahan pertanian  Itu coba dicek'

In [21]:
res = all_phase(coba)
res

'kritik penyusutan lahan pertanian coba dicek'

In [22]:
corpus = []
for i in range(0, len(data)) :
    review = data["tweet"][i]
    result = all_phase(review)
    corpus.append(result)

In [23]:
data["tweet"] = corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet"] = corpus


In [24]:
data.drop_duplicates(subset=["tweet"],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(subset=["tweet"],inplace=True)


In [25]:
data.to_csv("testing.csv",index=False)