# Cleaning and Merging Complaint Data

In [99]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from googletrans import Translator

In [100]:
# Load the first complaint dataset; the path is relative to the notebook's working directory.
x_data = pd.read_csv('datax.csv')
# Print column names, non-null counts and dtypes for a quick sanity check.
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27480 entries, 0 to 27479
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   complaint  27479 non-null  object
 1   category   27480 non-null  object
dtypes: object(2)
memory usage: 429.5+ KB


In [101]:
# Load the second complaint dataset from the parent directory.
review_data = pd.read_csv('../complaints.csv')
# Print column names, non-null counts and dtypes for a quick sanity check.
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   complaint  669 non-null    object
 1   category   669 non-null    object
dtypes: object(2)
memory usage: 10.6+ KB


In [102]:
# Drop rows that are exact duplicates across all columns (keeps the first occurrence).
review_data = review_data.drop_duplicates()

In [103]:
# Drop rows with a missing value in any column.
review_data = review_data.dropna()

In [104]:
# Re-check row and non-null counts after de-duplication and dropna.
review_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 642 entries, 0 to 656
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   complaint  642 non-null    object
 1   category   642 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


In [105]:
# Drop rows that are exact duplicates across all columns (keeps the first occurrence).
x_data = x_data.drop_duplicates()

In [106]:
# Drop rows with a missing value in any column, so the cleaning functions only ever see strings.
x_data = x_data.dropna()

In [107]:
# word_tokenize (used by remove_stopwords below) needs NLTK's Punkt tokenizer
# data in addition to the stop-word lists; without it a fresh environment
# fails with LookupError. Newer NLTK releases look the data up as "punkt_tab",
# older ones as "punkt", so request both.
nltk.download("punkt")
nltk.download("punkt_tab")
# Stop-word lists (the Indonesian list is used below). Kept last so the cell
# still displays the result of this call.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [108]:
def clean_complaint(complaint):
    """Normalize one raw complaint string before tokenization.

    Steps, in order:
      1. remove ``@username`` mentions,
      2. remove URLs (``http`` up to the next whitespace),
      3. delete separators inside numbers and apostrophes inside words so
         the token stays in one piece ("1.000.000" -> "1000000"),
      4. replace every other punctuation mark or symbol, including the
         underscore, with a space so neighbouring words are not fused
         ("mahal,lambat" -> "mahal lambat"),
      5. collapse whitespace runs and trim both ends.

    Parameters
    ----------
    complaint : str
        Raw complaint text. Any non-string value (e.g. a NaN that was not
        dropped beforehand) is treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned text. May be ``""`` when the input held nothing but
        mentions, URLs, punctuation or whitespace.
    """
    if not isinstance(complaint, str):
        return ''
    # Remove usernames after '@'
    complaint = re.sub(r'@\w+', '', complaint)
    # Remove URLs
    complaint = re.sub(r'http\S+', '', complaint)
    # Delete (not space out) '.'/',' between digits and apostrophes between
    # word characters, so numbers and words like "jum'at" stay single tokens.
    complaint = re.sub(r"(?<=\d)[.,](?=\d)|(?<=\w)['’](?=\w)", '', complaint)
    # Replace remaining punctuation/symbols with a space. '_' is listed
    # explicitly because \w matches it and it would otherwise survive.
    complaint = re.sub(r'[^\w\s]|_', ' ', complaint)
    # Collapse whitespace runs; strip() also covers leading whitespace.
    return re.sub(r'\s+', ' ', complaint).strip()

In [111]:
# Run clean_complaint on every complaint in x_data; the column is overwritten in place.
x_data['complaint'] = x_data['complaint'].apply(clean_complaint)

In [112]:
# Spot-check the first cleaned complaint (row position 0, column position 0).
x_data.iloc[0,0]

'di sisi lain istriku selalu seneng lihat hujan karena pas kecil pernah ngalamin kekeringan parah yg ampe air aja dijatah dan harus ngantri kalo musim hujannya mundur suka gak tenang karena keinget'

In [113]:
# Run clean_complaint on every complaint in review_data; the column is overwritten in place.
review_data['complaint'] = review_data['complaint'].apply(clean_complaint)

In [114]:
# Spot-check the first cleaned complaint (row position 0, column position 0).
review_data.iloc[0,0]

'as of 23 april 2019 di gedung utamanya dari dulu terkenal sbg tempat perkawinan bergengsi di foto terlampir adalah kegiatan kordinasi nasional khusus nya antara esdm dan dishut dalam upaya revegetasi hutan dan das akibat tambang bagus lah'

In [115]:
# Module-level helpers shared by the cells below.
# Indonesian stop-word list, held as a set for fast membership tests.
stop_words = {*stopwords.words("indonesian")}
# NLTK Porter stemmer instance used by remove_stopwords.
stemmer = PorterStemmer()
# googletrans client; no cell visible in this notebook references it.
translator = Translator()

In [116]:
def remove_stopwords(complaint):
    """Drop stop words from a complaint and stem the remaining tokens.

    The text is tokenized with ``word_tokenize``. A token is discarded when
    its lower-cased form is in the module-level ``stop_words`` set; every
    other token is passed through the module-level ``stemmer``. No
    translation is performed.

    NOTE(review): ``stemmer`` is an NLTK ``PorterStemmer`` while
    ``stop_words`` is the Indonesian list -- confirm that Porter stemming
    is intended for this text.

    Parameters
    ----------
    complaint : str
        Text to process (already cleaned by ``clean_complaint`` in this
        notebook).

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(complaint):
        if token.lower() in stop_words:
            continue  # stop word: drop it
        kept.append(stemmer.stem(token))
    return " ".join(kept)

In [117]:
# Remove stop words and stem every complaint in x_data; the column is overwritten in place.
x_data['complaint'] = x_data['complaint'].apply(remove_stopwords)

In [118]:
# Remove stop words and stem every complaint in review_data; the column is overwritten in place.
review_data['complaint'] = review_data['complaint'].apply(remove_stopwords)

In [119]:
# Spot-check one processed complaint by position. 26695 is a hard-coded row
# position and raises IndexError if x_data has fewer rows than that.
x_data.iloc[26695,0]

'ava kpop pemuja cwo ngondek melambai denial kalo jefri nichol ga ganteng kebanyakan kena limbah plasti mata lu the one beauti priveleg is real manusia pemikirannya fluktuatif berubah ubah ditambah muda'

In [120]:
# Merge the two cleaned DataFrames into one, with a fresh 0..n-1 index.
merged_data = pd.concat([x_data, review_data], ignore_index=True)

# Cleaning can reduce a complaint to an empty string (e.g. it held only a
# mention, a URL or stop words). to_csv writes "" as a blank field, which
# read_csv parses back as NaN, so drop those rows before saving.
has_text = merged_data['complaint'].str.strip().ne('')
merged_data = merged_data[has_text]

# Rows that differed only in punctuation, URLs or stop words are identical
# after cleaning, and the two sources may overlap, so de-duplicate again.
merged_data = merged_data.drop_duplicates().reset_index(drop=True)

# Save the merged DataFrame to a CSV file
merged_data.to_csv('merged_data.csv', index=False)