In [1]:
from google.colab import auth
auth.authenticate_user()
!git config --global user.email "rastiaulia08@gmail.com"
!git config --global user.name "rastiauliaanggraini"
!git clone https://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@github.com/IET-Polinela/supervised-learning-rastiauliaanggraini.git

Cloning into 'supervised-learning-rastiauliaanggraini'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 15 (delta 2), reused 13 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (15/15), 269.58 KiB | 5.39 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [2]:
# Switch the working directory to the cloned repository; later cells build
# paths from os.getcwd() and load "train.csv" relative to it.
%cd /content/supervised-learning-rastiauliaanggraini/

/content/supervised-learning-rastiauliaanggraini


In [18]:
%%writefile supervised.py
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Tentukan folder visualizations dan path absolut
visualization_dir = os.path.join(os.getcwd(), 'visualizations')

# Pastikan folder visualizations ada, jika tidak, buat folder tersebut
if not os.path.exists(visualization_dir):
    try:
        os.makedirs(visualization_dir)
        print(f"Folder '{visualization_dir}' berhasil dibuat.")
    except Exception as e:
        print(f"Error membuat folder '{visualization_dir}': {e}")
else:
    print(f"Folder '{visualization_dir}' sudah ada.")

# Load dataset
file_path = "train.csv"  # Sesuaikan dengan lokasi file

df = pd.read_csv(file_path)

# 1. Data Understanding
numeric_cols = df.select_dtypes(include=[np.number]).columns  # Hanya kolom numerik
numeric_stats = df[numeric_cols].describe().T
numeric_stats["median"] = df[numeric_cols].median()
numeric_stats = numeric_stats[["count", "mean", "median", "std", "min", "25%", "50%", "75%", "max"]]
print(numeric_stats)

# Save Data Understanding visualization
plt.figure(figsize=(10, 6))

# Menggunakan hanya kolom numerik untuk korelasi
sns.heatmap(df[numeric_cols].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.savefig(os.path.join(visualization_dir, "data_understanding_heatmap.png"))
plt.close()

# 2. Data Preprocessing
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Handle NaN values
    label_encoders[col] = le

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# Menggunakan SimpleImputer untuk menangani nilai NaN
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)  # Pastikan X tetap menjadi DataFrame

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Outlier Handling
# Boxplot untuk semua fitur numerik
plt.figure(figsize=(15, 8))
X.select_dtypes(include=['number']).boxplot(rot=90, grid=False)
plt.title("Boxplot dari Semua Fitur Numerik")
plt.xticks(rotation=90)
plt.savefig(os.path.join(visualization_dir, "boxplot_fitur_numerik.png"))  # Menyimpan visualisasi boxplot
plt.close()

# Metode IQR untuk menangani outlier
Q1 = X_train.quantile(0.25)  # Pastikan X_train tetap DataFrame
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1
X_train_no_outliers = X_train[~((X_train < (Q1 - 1.5 * IQR)) | (X_train > (Q3 + 1.5 * IQR))).any(axis=1)]
y_train_no_outliers = y_train.loc[X_train_no_outliers.index]

# 4. Feature Scaling
scalers = {"StandardScaler": StandardScaler(), "MinMaxScaler": MinMaxScaler()}
scaled_data = {}
for name, scaler in scalers.items():
    scaler.fit(X_train_no_outliers)
    scaled_data[name] = scaler.transform(X_train_no_outliers)
    plt.hist(scaled_data[name], bins=50, alpha=0.5, label=name)
plt.legend()
plt.title("Distribusi Data Sebelum dan Sesudah Scaling")
plt.savefig(os.path.join(visualization_dir, "distribusi_data_scaling.png"))  # Menyimpan distribusi data
plt.close()

# 5. Implementation: Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Save Linear Regression results visualization
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_lr, alpha=0.5)
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("Linear Regression: Predicted vs Actual")
plt.savefig(os.path.join(visualization_dir, "linear_regression_results.png"))
plt.close()

# 6. Implementation: Polynomial Regression
poly_degrees = [2, 3]
results_poly = {}
for d in poly_degrees:
    poly = PolynomialFeatures(degree=d)
    X_poly_train = poly.fit_transform(X_train_no_outliers)
    X_poly_test = poly.transform(X_test)
    lr_poly = LinearRegression()
    lr_poly.fit(X_poly_train, y_train_no_outliers)
    y_pred_poly = lr_poly.predict(X_poly_test)
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)
    results_poly[d] = {"MSE": mse_poly, "R2": r2_poly}

# Save Polynomial Regression results visualization
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_poly, alpha=0.5, label="Polynomial Regression")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("Polynomial Regression: Predicted vs Actual")
plt.savefig(os.path.join(visualization_dir, "polynomial_regression_results.png"))
plt.close()

# 7. Implementation: KNN Regression
results_knn = {}
knn_preds = {}  # Menyimpan hasil prediksi untuk setiap K
for k in [3, 5, 7]:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train_no_outliers, y_train_no_outliers)
    y_pred_knn = knn.predict(X_test)
    knn_preds[k] = y_pred_knn  # Simpan hasil prediksi KNN untuk setiap k
    mse_knn = mean_squared_error(y_test, y_pred_knn)
    r2_knn = r2_score(y_test, y_pred_knn)
    results_knn[k] = {"MSE": mse_knn, "R2": r2_knn}

# Save KNN Regression results visualization
plt.figure(figsize=(10, 6))
for k in [3, 5, 7]:
    plt.scatter(y_test, knn_preds[k], alpha=0.5, label=f"KNN Regression (K={k})")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("KNN Regression: Predicted vs Actual")
plt.legend()
plt.savefig(os.path.join(visualization_dir, "knn_regression_results.png"))
plt.close()

# 8. Analysis Comparison Models and Conclusion
# Tabel perbandingan MSE dan R2
comparison_df = pd.DataFrame({
    "Model": ["Linear Regression"] + [f"Polynomial Regression (Degree {d})" for d in poly_degrees] + [f"KNN Regression (K={k})" for k in [3, 5, 7]],
    "MSE": [mse_lr] + [results_poly[d]["MSE"] for d in poly_degrees] + [results_knn[k]["MSE"] for k in [3, 5, 7]],
    "R2": [r2_lr] + [results_poly[d]["R2"] for d in poly_degrees] + [results_knn[k]["R2"] for k in [3, 5, 7]]
})

# Save Comparison visualization
plt.figure(figsize=(10, 6))
sns.barplot(x="Model", y="MSE", data=comparison_df)
plt.title("Model Comparison - MSE")
plt.xticks(rotation=45)
plt.savefig(os.path.join(visualization_dir, "model_comparison_mse.png"))
plt.close()


Writing supervised.py


In [13]:
import os
import shutil

# Absolute path of the visualizations folder inside the current working directory.
visualization_dir = os.path.join(os.getcwd(), 'visualizations')

# Delete the folder together with everything in it, or report that it is absent.
folder_present = os.path.exists(visualization_dir)
if not folder_present:
    print(f"Folder '{visualization_dir}' tidak ditemukan.")
else:
    shutil.rmtree(visualization_dir)
    print(f"Folder '{visualization_dir}' telah dihapus.")


Folder '/content/supervised-learning-rastiauliaanggraini/visualizations' telah dihapus.


In [17]:
# Inline run of the supervised-learning pipeline.
# NOTE(review): this cell repeats the script that the %%writefile cell stores as
# supervised.py; keep the two in sync, or consider running the file instead.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Absolute path of the folder that receives every saved figure.
visualization_dir = os.path.join(os.getcwd(), 'visualizations')

# Create the folder when it is missing. A failure is reported and re-raised:
# every plt.savefig() below needs the folder, so carrying on would only fail
# later with a less helpful error.
if not os.path.exists(visualization_dir):
    try:
        os.makedirs(visualization_dir)
        print(f"Folder '{visualization_dir}' berhasil dibuat.")
    except OSError as e:
        print(f"Error membuat folder '{visualization_dir}': {e}")
        raise
else:
    print(f"Folder '{visualization_dir}' sudah ada.")

# Load dataset
file_path = "train.csv"  # Adjust to the location of the file

df = pd.read_csv(file_path)

# 1. Data Understanding
numeric_cols = df.select_dtypes(include=[np.number]).columns  # Numeric columns only
numeric_stats = df[numeric_cols].describe().T
numeric_stats["median"] = df[numeric_cols].median()
numeric_stats = numeric_stats[["count", "mean", "median", "std", "min", "25%", "50%", "75%", "max"]]
print(numeric_stats)

# Save Data Understanding visualization
plt.figure(figsize=(10, 6))

# Correlation is computed on the numeric columns only
sns.heatmap(df[numeric_cols].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.savefig(os.path.join(visualization_dir, "data_understanding_heatmap.png"))
plt.close()

# 2. Data Preprocessing
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the string "nan", which gets its own label
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# Split BEFORE imputing so the medians are learned from the training rows only;
# fitting the imputer on the full data would leak test-set statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='median')
# Keep the original row index so y can still be aligned by label further down.
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Full imputed feature table (train + test rows, original order), used for plotting only.
X = pd.concat([X_train, X_test]).sort_index()

# 3. Outlier Handling
# Boxplot of every numeric feature
plt.figure(figsize=(15, 8))
X.select_dtypes(include=['number']).boxplot(rot=90, grid=False)
plt.title("Boxplot dari Semua Fitur Numerik")
plt.xticks(rotation=90)
plt.tight_layout()  # keep the rotated feature names inside the saved image
plt.savefig(os.path.join(visualization_dir, "boxplot_fitur_numerik.png"))
plt.close()

# IQR rule: drop every training row that has at least one feature outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = X_train.quantile(0.25)
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1
X_train_no_outliers = X_train[~((X_train < (Q1 - 1.5 * IQR)) | (X_train > (Q3 + 1.5 * IQR))).any(axis=1)]
y_train_no_outliers = y_train.loc[X_train_no_outliers.index]

# 4. Feature Scaling
# NOTE(review): the scaled arrays are only plotted; the models below are fitted
# on unscaled features — confirm that is intended, since KNN distances are
# scale-sensitive.
scalers = {"StandardScaler": StandardScaler(), "MinMaxScaler": MinMaxScaler()}
scaled_data = {}
plt.figure(figsize=(10, 6))
for name, scaler in scalers.items():
    scaler.fit(X_train_no_outliers)
    scaled_data[name] = scaler.transform(X_train_no_outliers)
    plt.hist(scaled_data[name], bins=50, alpha=0.5, label=name)
plt.legend()
plt.title("Distribusi Data Sebelum dan Sesudah Scaling")
plt.savefig(os.path.join(visualization_dir, "distribusi_data_scaling.png"))
plt.close()

# 5. Implementation: Linear Regression
# NOTE(review): this model is fitted on X_train (outliers included) while the
# polynomial and KNN models use the outlier-filtered rows — confirm the
# comparison is meant to mix both.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Save Linear Regression results visualization
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_lr, alpha=0.5)
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("Linear Regression: Predicted vs Actual")
plt.savefig(os.path.join(visualization_dir, "linear_regression_results.png"))
plt.close()

# 6. Implementation: Polynomial Regression
poly_degrees = [2, 3]
results_poly = {}
poly_preds = {}  # Test-set predictions per degree, kept for the plot below
for d in poly_degrees:
    poly = PolynomialFeatures(degree=d)
    X_poly_train = poly.fit_transform(X_train_no_outliers)
    X_poly_test = poly.transform(X_test)
    lr_poly = LinearRegression()
    lr_poly.fit(X_poly_train, y_train_no_outliers)
    y_pred_poly = lr_poly.predict(X_poly_test)
    poly_preds[d] = y_pred_poly
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)
    results_poly[d] = {"MSE": mse_poly, "R2": r2_poly}

# Save Polynomial Regression results visualization: one series per degree,
# with a legend so the series can be told apart.
plt.figure(figsize=(10, 6))
for d in poly_degrees:
    plt.scatter(y_test, poly_preds[d], alpha=0.5, label=f"Polynomial Regression (Degree {d})")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("Polynomial Regression: Predicted vs Actual")
plt.legend()
plt.savefig(os.path.join(visualization_dir, "polynomial_regression_results.png"))
plt.close()

# 7. Implementation: KNN Regression
knn_neighbors = [3, 5, 7]  # Values of K to evaluate
results_knn = {}
knn_preds = {}  # Test-set predictions per K
for k in knn_neighbors:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train_no_outliers, y_train_no_outliers)
    y_pred_knn = knn.predict(X_test)
    knn_preds[k] = y_pred_knn
    mse_knn = mean_squared_error(y_test, y_pred_knn)
    r2_knn = r2_score(y_test, y_pred_knn)
    results_knn[k] = {"MSE": mse_knn, "R2": r2_knn}

# Save KNN Regression results visualization
plt.figure(figsize=(10, 6))
for k in knn_neighbors:
    plt.scatter(y_test, knn_preds[k], alpha=0.5, label=f"KNN Regression (K={k})")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("KNN Regression: Predicted vs Actual")
plt.legend()
plt.savefig(os.path.join(visualization_dir, "knn_regression_results.png"))
plt.close()

# 8. Analysis Comparison Models and Conclusion
# Comparison table of MSE and R2 for every fitted model
comparison_df = pd.DataFrame({
    "Model": ["Linear Regression"] + [f"Polynomial Regression (Degree {d})" for d in poly_degrees] + [f"KNN Regression (K={k})" for k in knn_neighbors],
    "MSE": [mse_lr] + [results_poly[d]["MSE"] for d in poly_degrees] + [results_knn[k]["MSE"] for k in knn_neighbors],
    "R2": [r2_lr] + [results_poly[d]["R2"] for d in poly_degrees] + [results_knn[k]["R2"] for k in knn_neighbors]
})
# Show the table itself; the bar chart below only covers MSE.
print(comparison_df)

# Save Comparison visualization
plt.figure(figsize=(10, 6))
sns.barplot(x="Model", y="MSE", data=comparison_df)
plt.title("Model Comparison - MSE")
plt.xticks(rotation=45)
plt.tight_layout()  # keep the rotated model names inside the saved image
plt.savefig(os.path.join(visualization_dir, "model_comparison_mse.png"))
plt.close()


Folder '/content/supervised-learning-rastiauliaanggraini/visualizations' sudah ada.
                count           mean    median           std      min  \
Id             1460.0     730.500000     730.5    421.610009      1.0   
MSSubClass     1460.0      56.897260      50.0     42.300571     20.0   
LotFrontage    1201.0      70.049958      69.0     24.284752     21.0   
LotArea        1460.0   10516.828082    9478.5   9981.264932   1300.0   
OverallQual    1460.0       6.099315       6.0      1.382997      1.0   
OverallCond    1460.0       5.575342       5.0      1.112799      1.0   
YearBuilt      1460.0    1971.267808    1973.0     30.202904   1872.0   
YearRemodAdd   1460.0    1984.865753    1994.0     20.645407   1950.0   
MasVnrArea     1452.0     103.685262       0.0    181.066207      0.0   
BsmtFinSF1     1460.0     443.639726     383.5    456.098091      0.0   
BsmtFinSF2     1460.0      46.549315       0.0    161.319273      0.0   
BsmtUnfSF      1460.0     567.240411    

In [20]:
# Stage everything in the working directory, commit it, and push the commit to
# the `main` branch of `origin` (setting it as the upstream branch).
# NOTE(review): the captured output shows `git init` only reinitialises an
# already existing repository here, so it can probably be dropped.
# NOTE(review): `git add .` stages every file in the folder — check the staged
# list before committing.
!git init
!git add .
!git commit -m "Memperbaiki Visualisasi"
!git push -u origin main

Reinitialized existing Git repository in /content/supervised-learning-rastiauliaanggraini/.git/
[main 09e8ed6] Memperbaiki Visualisasi
 11 files changed, 168 insertions(+), 1 deletion(-)
 delete mode 100644 Lab4_23758022.ipynb
 create mode 100644 supervised.py
 create mode 100644 visualizations/boxplot_fitur_numerik.png
 create mode 100644 visualizations/comparison_table.png
 create mode 100644 visualizations/data_understanding_heatmap.png
 create mode 100644 visualizations/distribusi_data_scaling.png
 create mode 100644 visualizations/knn_regression_results.png
 create mode 100644 visualizations/linear_regression_results.png
 create mode 100644 visualizations/model_comparison_mse.png
 create mode 100644 visualizations/polynomial_regression_results.png
 create mode 100644 visualizations/prediksi_vs_aktual.png
Enumerating objects: 14, done.
Counting objects: 100% (14/14), done.
Delta compression using up to 2 threads
Compressing objects: 100% (13/13), done.
Writing objects: 100% (13/13)

In [21]:
# Mount the user's Google Drive into the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!cp "/content/drive/My Drive/Colab Notebooks/Lab4_23758022.ipynb" "/content/intro-to-colab-rastiauliaanggraini/"