In [7]:
# 1. Contoh data cleaning
# A. Missing values
import pandas as pd
import numpy as np

# Membuat dataframe contoh
data = {
    'Name': ['Jimmy', 'Arif', 'Rudi', 'Maulana', 'Taufiq', np.nan],
    'Age': [20, 21, 21, 22, np.nan, 50],
    'City': ['Sago', 'Painan', 'Lumpo', 'Bayang', 'Airpura', 'Los Angeles']
}

df = pd.DataFrame(data)

# Menampilkan data awal
print("Data Awal:")
print(df)

# Menghapus duplikat
df = df.drop_duplicates()

# Menangani missing values dengan mengisi nilai median untuk kolom numerik
df['Age'] = df['Age'].fillna(df['Age'].median())

# Menghapus baris yang mengandung missing values di kolom 'Name'
df = df.dropna(subset=['Name'])

# Menampilkan data setelah cleaning
print("\nData Setelah Cleaning:")
print(df)

Data Awal:
      Name   Age         City
0    Jimmy  20.0         Sago
1     Arif  21.0       Painan
2     Rudi  21.0        Lumpo
3  Maulana  22.0       Bayang
4   Taufiq   NaN      Airpura
5      NaN  50.0  Los Angeles

Data Setelah Cleaning:
      Name   Age     City
0    Jimmy  20.0     Sago
1     Arif  21.0   Painan
2     Rudi  21.0    Lumpo
3  Maulana  22.0   Bayang
4   Taufiq  21.0  Airpura


In [8]:
# 1. Contoh data cleaning
# B. Meghapus Duplikat
import pandas as pd
import numpy as np

# Membuat dataframe contoh
data = {
    'Name': ['Jimmy', 'Arif', 'Rudi', 'Maulana', 'Taufiq', np.nan],
    'Age': [20, 21, 21, 22, np.nan, 50],
    'City': ['Sago', 'Painan', 'Lumpo', 'Bayang', 'Airpura', 'Los Angeles']
}

df = pd.DataFrame(data)

# Menampilkan data awal
print("Data Awal:")
print(df)

# Identifikasi duplikat
duplicate_count = (df.duplicated().sum())
print(f"\nJumlah data yang terduplikat: {duplicate_count}")
# Menghapus duplikat
df.drop_duplicates(inplace=True)

# Menampilkan data setelah cleaning
print("\nData Setelah Cleaning:")
print(df)
duplicate_count = (df.duplicated().sum())
print(f"\nJumlah data yang terduplikat: {duplicate_count}")

Data Awal:
      Name   Age         City
0    Jimmy  20.0         Sago
1     Arif  21.0       Painan
2     Rudi  21.0        Lumpo
3  Maulana  22.0       Bayang
4   Taufiq   NaN      Airpura
5      NaN  50.0  Los Angeles

Jumlah data yang terduplikat: 0

Data Setelah Cleaning:
      Name   Age         City
0    Jimmy  20.0         Sago
1     Arif  21.0       Painan
2     Rudi  21.0        Lumpo
3  Maulana  22.0       Bayang
4   Taufiq   NaN      Airpura
5      NaN  50.0  Los Angeles

Jumlah data yang terduplikat: 0


In [9]:
# 1. Contoh data cleansing
# C. Menangani Outliers

# Langkah 1: Membuat DataFrame
import pandas as pd
import numpy as np
data = {
'Name': ['Jimmy', 'Arif', 'Rudi', 'Maulana', 'Taufiq', np.nan],
    'Age': [20, 21, 21, 22, np.nan, 50],
    'City': ['Sago', 'Painan', 'Lumpo', 'Bayang', 'Airpura', 'Los Angeles']
}
df = pd.DataFrame(data)
# Langkah 2: Identifikasi Outliers
# Outliers dapat diidentifikasi menggunakan berbagai metode, salah satu yang umum adalah menggunakan Z-score atau IQR (Interquartile Range).
# Di sini, kita akan menggunakan IQR untuk mendeteksi outliers.
# Menghitung IQR
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1
# Menentukan batas bawah dan atas untuk outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Menandai outliers
outliers = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)]
print("Outliers:\n", outliers)
# Langkah 3: Menangani Outliers
# # Menghapus outliers
df_no_outliers = df[(df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)]
# Mengganti outliers dengan nilai median
median_age = df['Age'].median()
df['Age'] = np.where((df['Age'] < lower_bound) | (df['Age'] > upper_bound),
median_age, df['Age'])
# Mengisi missing values dengan median
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Name'].fillna('Unknown', inplace=True)

Outliers:
   Name   Age         City
5  NaN  50.0  Los Angeles


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)


In [11]:
# 1. Contoh data Cleansing
# D. Mengubah data Kategorikal ke Numerik
# Contoh: misalnya, kita memiliki dataset dengan kolom "City" yang berisi data kategorikal:
import pandas as pd
data = {'City': ['Sago', 'Painan', 'Lumpo', 'Bayang', 'Airpura', 'Los Angeles']}
df = pd.DataFrame(data)
# One-Hot Encoding
df_one_hot = pd.get_dummies(df, columns=['City'])
print(df_one_hot)

   City_Airpura  City_Bayang  City_Los Angeles  City_Lumpo  City_Painan  \
0         False        False             False       False        False   
1         False        False             False       False         True   
2         False        False             False        True        False   
3         False         True             False       False        False   
4          True        False             False       False        False   
5         False        False              True       False        False   

   City_Sago  
0       True  
1      False  
2      False  
3      False  
4      False  
5      False  


In [12]:
# 1. Contoh data Cleansing
# E. Menangani Data tidak Valid

import pandas as pd
import numpy as np
data = {
'Name': ['Jimmy', 'Arif', 'Rudi', 'Maulana', 'Taufiq', np.nan],
    'Age': [20, 21, 21, 22, np.nan, 50],
    'City': ['Sago', 'Painan', 'Lumpo', 'Bayang', 'Airpura', 'Los Angeles']
}
df = pd.DataFrame(data)
# Identifikasi nilai tidak valid

print("Data Awal:")
print(df)
# Mengubah nilai 'Age' yang tidak valid menjadi NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
# Mengisi nilai 'Age' yang hilang atau tidak valid dengan median
df['Age'] = df['Age'].fillna(df['Age'].median())
# Mengisi nilai 'Name' yang hilang dengan 'Unknown'
df['Name'] = df['Name'].fillna('Unknown')
print("\nData Setelah Menangani Nilai Tidak Valid:")
print(df)

Data Awal:
      Name   Age         City
0    Jimmy  20.0         Sago
1     Arif  21.0       Painan
2     Rudi  21.0        Lumpo
3  Maulana  22.0       Bayang
4   Taufiq   NaN      Airpura
5      NaN  50.0  Los Angeles

Data Setelah Menangani Nilai Tidak Valid:
      Name   Age         City
0    Jimmy  20.0         Sago
1     Arif  21.0       Painan
2     Rudi  21.0        Lumpo
3  Maulana  22.0       Bayang
4   Taufiq  21.0      Airpura
5  Unknown  50.0  Los Angeles


In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Contoh dataset
data = {
    'Age': [20, 21, 21, 22, np.nan, 50],  # Pastikan ada np.nan agar bisa ditangani
    'Salary': [50000, 60000, 70000, 80000, 90000, 100000]
}

df = pd.DataFrame(data)

# Menangani missing values pada 'Age' dengan median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Min-Max Scaling
min_max_scaler = MinMaxScaler()
df_min_max_scaled = pd.DataFrame(min_max_scaler.fit_transform(df), columns=df.columns)

# Z-Score Normalization (Standardization)
standard_scaler = StandardScaler()
df_standard_scaled = pd.DataFrame(standard_scaler.fit_transform(df), columns=df.columns)

# Menampilkan hasil
print("Data Asli:")
print(df)

print("\nMin-Max Scaled Data:")
print(df_min_max_scaled)

print("\nStandardized Data:")
print(df_standard_scaled)


Data Asli:
    Age  Salary
0  20.0   50000
1  21.0   60000
2  21.0   70000
3  22.0   80000
4  21.0   90000
5  50.0  100000

Min-Max Scaled Data:
        Age  Salary
0  0.000000     0.0
1  0.033333     0.2
2  0.033333     0.4
3  0.066667     0.6
4  0.033333     0.8
5  1.000000     1.0

Standardized Data:
        Age   Salary
0 -0.538972 -1.46385
1 -0.446577 -0.87831
2 -0.446577 -0.29277
3 -0.354182  0.29277
4 -0.446577  0.87831
5  2.232884  1.46385


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [15]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np
# Contoh dataset
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([2, 3, 4, 5, 6])
# Model
model = LinearRegression()
# K-Fold Cross-Validation
kf = KFold(n_splits=5)
scores = cross_val_score(model, X, y, cv=kf)
print("K-Fold Cross-Validation Scores:", scores)
print("Mean Score:", np.mean(scores))

K-Fold Cross-Validation Scores: [nan nan nan nan nan]
Mean Score: nan




In [17]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Model Logistic Regression
model = LogisticRegression(max_iter=200)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds dengan shuffle dan random state 42
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Menampilkan hasil
print("K-Fold Cross-Validation Scores:", scores)
print("Mean Score:", scores.mean())


K-Fold Cross-Validation Scores: [1.         1.         0.93333333 0.96666667 0.96666667]
Mean Score: 0.9733333333333334


In [18]:
import numpy as np

# Data
data = np.array([10, 15, 8, 12, 14, 20, 18, 16, 11, 13])

# Bootstrap sampling
n_samples = 1000
bootstrap_means = np.empty(n_samples)

for i in range(n_samples):
    bootstrap_sample = np.random.choice(data, size=len(data), replace=True)
    bootstrap_means[i] = np.mean(bootstrap_sample)

# Confidence interval (95%)
ci_lower = np.percentile(bootstrap_means, 2.5)
ci_upper = np.percentile(bootstrap_means, 97.5)

# Menampilkan hasil
print(f"Mean: {np.mean(data):.2f}")
print(f"95% Confidence Interval: ({ci_lower:.2f} - {ci_upper:.2f})")


Mean: 13.70
95% Confidence Interval: (11.50 - 15.80)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Bagi dataset menjadi data pelatihan dan data pengujian (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inisialisasi dan latih model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Prediksi dengan data pengujian
y_pred = model.predict(X_test)

# Evaluasi kinerja model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Menampilkan hasil
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 1.00
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [None]:
# Contoh Confusion Matrix di NLP

# Impor library yang diperlukan
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report

# Unduh dataset sentimen ulasan film dari NLTK
nltk.download('movie_reviews')

# Ambil ulasan dan label dari dataset
documents = [
    (" ".join(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Pisahkan teks ulasan dan label
texts, labels = zip(*documents)

# Ubah teks menjadi vektor fitur TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(texts)

# Bagi dataset menjadi data pelatihan dan data pengujian (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Inisialisasi dan latih model klasifikasi (Linear SVM)
classifier = LinearSVC(dual=False)  # Mengatasi warning jika dataset besar
classifier.fit(X_train, y_train)

# Prediksi kelas pada data pengujian
y_pred = classifier.predict(X_test)

# Evaluasi model menggunakan confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Evaluasi model menggunakan classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
