# Cleaning Data Stuff

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Read the first complaint dataset from the working directory and print
# its column / null-count summary.
x_data = pd.read_csv(filepath_or_buffer='datax.csv')
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27480 entries, 0 to 27479
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   complaint  27479 non-null  object
 1   category   27480 non-null  object
dtypes: object(2)
memory usage: 429.5+ KB


In [3]:
# Read the second complaint dataset (one directory up) and print its
# column / null-count summary.
review_data = pd.read_csv(filepath_or_buffer='../complaints.csv')
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   complaint  669 non-null    object
 1   category   669 non-null    object
dtypes: object(2)
memory usage: 10.6+ KB


In [4]:
# Keep only the first occurrence of each fully identical row.
review_data = review_data.loc[~review_data.duplicated()]

In [5]:
# Discard every row that has a missing value in any column.
review_data = review_data.dropna(axis=0, how='any')

In [6]:
# Re-inspect the frame after de-duplication and null removal.
review_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 642 entries, 0 to 656
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   complaint  642 non-null    object
 1   category   642 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


In [7]:
# Drop repeated rows, retaining the first copy of each.
x_data = x_data.drop_duplicates(keep='first')

In [8]:
# Remove rows containing any missing value so that the text-cleaning
# functions applied later only ever receive strings.
x_data = x_data.dropna(axis='index', how='any')

In [9]:
# Make sure the NLTK "stopwords" corpus is available locally; it is read
# later through stopwords.words(...).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def clean_complaint(complaint):
    """Strip noise from a raw complaint string.

    Removes ``@mention`` handles, anything starting with ``http`` up to the
    next whitespace, invisible/filler characters, and every character that is
    neither a word character nor whitespace. Whitespace runs are then
    collapsed to a single space and the ends are trimmed.

    Args:
        complaint: The raw complaint text (``str``).

    Returns:
        The cleaned text, with no leading/trailing whitespace and no
        consecutive whitespace characters.
    """
    # Remove usernames after '@'
    complaint = re.sub(r'@\w+', '', complaint)
    # Remove URLs
    complaint = re.sub(r'http\S+', '', complaint)
    # Remove invisible / filler characters such as 'ㅤ' (U+3164). This has to
    # happen BEFORE whitespace is normalised: U+3164 counts as a word
    # character, so it survives the punctuation pass, and deleting it after
    # the whitespace pass would leave doubled or leading/trailing spaces.
    complaint = re.sub(r'[\u200B-\u200D\uFEFF\u3164]+', '', complaint)
    # Remove punctuation and special symbols (commas, periods, etc.)
    complaint = re.sub(r'[^\w\s]', '', complaint)
    # Collapse whitespace runs and trim the ends (must be the final step)
    complaint = re.sub(r'\s+', ' ', complaint).strip()
    return complaint

In [11]:
# Run the text cleaner over every complaint in the first dataset.
x_data['complaint'] = x_data['complaint'].map(clean_complaint)

In [12]:
# Spot-check: first row, first column after cleaning.
x_data.iloc[0,0]

'di sisi lain istriku selalu seneng lihat hujan karena pas kecil pernah ngalamin kekeringan parah yg ampe air aja dijatah dan harus ngantri kalo musim hujannya mundur suka gak tenang karena keinget'

In [13]:
# Run the text cleaner over every complaint in the second dataset.
review_data['complaint'] = review_data['complaint'].map(clean_complaint)

In [14]:
# Spot-check: first row, first column after cleaning.
review_data.iloc[0,0]

'as of 23 april 2019 di gedung utamanya dari dulu terkenal sbg tempat perkawinan bergengsi di foto terlampir adalah kegiatan kordinasi nasional khusus nya antara esdm dan dishut dalam upaya revegetasi hutan dan das akibat tambang bagus lah'

In [33]:
# Indonesian stop words from NLTK, held in a set for fast membership tests.
stop_words = {*stopwords.words("indonesian")}

In [34]:
# Build the Sastrawi stemmer used by remove_stopwords().
# Note: despite its name, `lemmatizer` is the object returned by
# create_stemmer(), i.e. a stemmer.
factory = StemmerFactory()
lemmatizer = factory.create_stemmer()

In [35]:
def remove_stopwords(complaint):
    """Lowercase the text, drop stop words, and stem the remaining tokens.

    Relies on two module-level objects: ``stop_words`` (membership-tested for
    each token) and ``lemmatizer`` (its ``stem`` method is called once per
    surviving token).

    Args:
        complaint: Text to process (``str``).

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    stemmed_tokens = []
    # Tokens are whatever str.split() yields on the lowercased text.
    for token in complaint.lower().split():
        if token in stop_words:
            continue
        stemmed_tokens.append(lemmatizer.stem(token))
    return ' '.join(stemmed_tokens)

In [None]:
# Remove stop words and stem every complaint in the first dataset.
# NOTE(review): the saved notebook shows no execution count for this cell,
# so later outputs may have been produced without this step - confirm.
x_data['complaint'] = x_data['complaint'].map(remove_stopwords)

In [None]:
# Remove stop words and stem every complaint in the second dataset.
# NOTE(review): the saved notebook shows no execution count for this cell,
# so later outputs may have been produced without this step - confirm.
review_data['complaint'] = review_data['complaint'].map(remove_stopwords)

In [15]:
# Stack the two cleaned frames row-wise under a fresh 0..n-1 index.
merged_data = pd.concat(objs=[x_data, review_data], ignore_index=True)

# Write the combined frame to disk (the index is not written).
merged_data.to_csv(path_or_buf='merged_data.csv', index=False)

In [16]:
# Fold the 'jalan rusak' and 'macet' labels into 'fasilitas umum'.
merged_data.loc[merged_data['category'].isin(['jalan rusak', 'macet']), 'category'] = 'fasilitas umum'

In [17]:
# Rename the 'others' label to 'lainnya'.
merged_data.loc[merged_data['category'].eq('others'), 'category'] = 'lainnya'

In [18]:
# Fold the 'tercemar', 'pencemaran', 'asap' and 'air' labels into 'polusi'.
merged_data.loc[merged_data['category'].isin(['tercemar', 'pencemaran', 'asap', 'air']), 'category'] = 'polusi'

In [19]:
# Cleaning and relabelling can make previously distinct rows identical;
# keep only the first copy of each.
merged_data = merged_data.loc[~merged_data.duplicated(keep='first')]

In [20]:
# Keep only rows in which every column has a value.
merged_data = merged_data.loc[merged_data.notna().all(axis=1)]

In [21]:
# Regex that accepts a complaint only when the whole text consists of ASCII
# letters and whitespace. Any digit or other character rejects the row.
# NOTE(review): an earlier description said "mainly alphabetic", but the
# pattern is strict - confirm that dropping every row with a digit is intended.
pattern = r'^[a-zA-Z\s]+$'

# Keep the rows whose complaint matches; missing values count as no match.
merged_data = merged_data.loc[lambda df: df['complaint'].str.match(pattern, na=False)]

In [22]:
# Overwrite the saved CSV with the filtered frame (index not written).
merged_data.to_csv(path_or_buf='merged_data.csv', index=False)

In [23]:
# Terms that mark a complaint as being about 'layanan'.
keywords = [
    'pungli', 'service', 'biaya', 'dokumen', 'izin', 'servis', 'pelayanan',
    'layanan', 'dibenahi', 'dibereskan', 'perhatikan', 'server', 'lapor',
    'laporan',
]

# Rows currently labelled 'lainnya' whose text contains any of the terms
# (case-insensitive, matched anywhere in the text) are relabelled 'layanan'.
merged_data.loc[
    merged_data['category'].eq('lainnya')
    & merged_data['complaint'].str.contains('|'.join(keywords), case=False),
    'category',
] = 'layanan'

In [24]:
# Discard complaints whose text is shorter than 50 characters.
merged_data = merged_data.loc[lambda df: ~(df['complaint'].str.len() < 50)]

In [25]:
# Terms that mark a complaint as being about 'lingkungan'.
keywords = ['banjir', 'kemarau', 'hujan', 'angin', 'tanah', 'kebakaran']

# Rows currently labelled 'lainnya' whose text contains any of the terms
# (case-insensitive, matched anywhere in the text) are relabelled 'lingkungan'.
merged_data.loc[
    merged_data['category'].eq('lainnya')
    & merged_data['complaint'].str.contains('|'.join(keywords), case=False),
    'category',
] = 'lingkungan'

In [26]:
# Dryness-related terms.
keywords = ['kering', 'tandus', 'gersang']

# Rows currently labelled 'lainnya' whose text contains any of the terms
# (case-insensitive, matched anywhere in the text) are relabelled 'lingkungan'.
# NOTE(review): the original heading for this cell named 'kekeringan' as the
# target, yet the assignment writes 'lingkungan' - confirm which category
# is intended.
merged_data.loc[
    merged_data['category'].eq('lainnya')
    & merged_data['complaint'].str.contains('|'.join(keywords), case=False),
    'category',
] = 'lingkungan'

In [27]:
# Terms that mark a complaint as being about 'sampah'.
keywords = ['tpa', 'plastik']

# Rows currently labelled 'lainnya' whose text contains any of the terms
# (case-insensitive, matched anywhere in the text) are relabelled 'sampah'.
merged_data.loc[
    merged_data['category'].eq('lainnya')
    & merged_data['complaint'].str.contains('|'.join(keywords), case=False),
    'category',
] = 'sampah'

In [28]:
# Row count per category after the keyword-based relabelling.
merged_data['category'].value_counts().reset_index()

Unnamed: 0,category,count
0,polusi,3190
1,lingkungan,2692
2,fasilitas umum,2474
3,lainnya,2206
4,sampah,1574
5,hutan,1129
6,limbah,295
7,layanan,136
8,kekeringan,94


In [29]:
# Terms whose presence causes a row to be discarded.
# NOTE(review): matching is by substring, so short terms also hit inside
# longer words (e.g. 'nasi' inside 'nasional', 'edit' inside 'kredit') -
# confirm that this breadth is intended.
keywords = [
    'prabowo', 'jokowi', 'gibran', 'wowo', 'owi', 'pramono', 'ronald tanur',
    'ivan', 'anies', 'anis', 'ahok', 'gerindra', 'greenflag', 'leceh',
    'pedo', 'baju', 'masak', 'kebahagiaan', 'penipuan', 'love', 'cinta',
    'lgbt', 'stylish', 'harga', 'densu', 'kpop', 'wibu', 'mental',
    'extrovert', 'introvert', 'fitnah', 'semangka', 'paul', 'post', 'edit',
    'napi', 'blow', 'lagu', 'liga', 'baper', 'fess', 'base',
    'psi', 'kartika', 'nasi', 'pecel',
]
# Keep only the rows whose complaint contains none of the terms
# (case-insensitive; missing text counts as "no match").
merged_data = merged_data.loc[
    lambda df: ~df['complaint'].str.contains('|'.join(keywords), case=False, na=False)
]

In [30]:
# Overwrite the saved CSV with the keyword-filtered frame (index not written).
merged_data.to_csv(path_or_buf='merged_data.csv', index=False)

In [35]:
# Draw a reproducible random subset of 1000 rows (fixed seed 32).
train_data = merged_data.sample(random_state=32, n=1000)

In [36]:
# Attach a constant is_complaint label of 0 to every sampled row.
# NOTE(review): confirm that 0 is the intended label for these rows.
train_data = train_data.assign(is_complaint=0)

In [37]:
# Keep just the text and its label, in that order.
train_data = train_data.loc[:, ['complaint', 'is_complaint']]

In [38]:
# Write the labelled sample one directory up (index not written).
train_data.to_csv(path_or_buf='../spam_training_data.csv', index=False)

In [39]:
# Row count per category after the off-topic keyword filter.
merged_data['category'].value_counts().reset_index()

Unnamed: 0,category,count
0,polusi,2770
1,lingkungan,2352
2,fasilitas umum,2229
3,lainnya,1954
4,sampah,1379
5,hutan,1032
6,limbah,267
7,layanan,122
8,kekeringan,88
