In [1]:
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')



# Download Data dengan menggunakan gdown

In [2]:
import gdown

# Google Drive location of the training data and the local file it is saved to.
file_url = 'https://drive.google.com/u/3/uc?id=1IW5DV9sW0FIkD_EJXzZkqpuwZ5sGXA2y&export=download'
output_file = 'train.csv'

# The download is slow, so only fetch the file when it is not already on disk.
# Delete the local file to force a fresh download.
if not os.path.exists(output_file):
    gdown.download(file_url, output_file, quiet=False)

# Read from the same path that was just downloaded instead of repeating the literal.
df = pd.read_csv(output_file)

Downloading...
From: https://drive.google.com/u/3/uc?id=1IW5DV9sW0FIkD_EJXzZkqpuwZ5sGXA2y&export=download
To: e:\1_D\Ristek_Datathon\Penyisihan\Notebook\Notebook Kumpul\train.csv
100%|██████████| 24.7M/24.7M [01:33<00:00, 264kB/s]


In [3]:
# Google Drive location of the test data and the local file it is saved to.
# NOTE: relies on `gdown` having been imported by the previous download cell.
file_url = 'https://drive.google.com/u/3/uc?id=19ZfNNAELrkmEyPVNDYKc59tSQeukkIUz&export=download'
output_file = 'test.csv'

# Only fetch the file when it is not already on disk; delete the local file to
# force a fresh download.
if not os.path.exists(output_file):
    gdown.download(file_url, output_file, quiet=False)

# Read from the same path that was just downloaded instead of repeating the literal.
test = pd.read_csv(output_file)

Downloading...
From: https://drive.google.com/u/3/uc?id=19ZfNNAELrkmEyPVNDYKc59tSQeukkIUz&export=download
To: e:\1_D\Ristek_Datathon\Penyisihan\Notebook\Notebook Kumpul\test.csv
100%|██████████| 7.80M/7.80M [00:20<00:00, 390kB/s]


# Memisahkan data menjadi per bagian jalan

In [13]:
# Reload both raw datasets from disk so the following cells start from
# unmodified copies of the training and test data.
df, test = (pd.read_csv(nama_berkas) for nama_berkas in ('train.csv', 'test.csv'))

In [14]:
# Build a single road-segment key of the form
# "<id_jalan>_<id_titik_mulai>_<id_titik_akhir>" and drop the three source columns.
kolom_id = ['id_jalan', 'id_titik_mulai', 'id_titik_akhir']


def _id_bagian_jalan(data):
    """Return the road-segment key for every row of `data` as a string Series.

    The concatenation is vectorised, so it does not depend on the frame having
    a default 0..n-1 index the way per-row label lookups do.
    """
    return (
        data['id_jalan'].astype(str)
        + '_' + data['id_titik_mulai'].astype(str)
        + '_' + data['id_titik_akhir'].astype(str)
    )


# `assign` appends the new column at the end, matching the original column order.
df = df.assign(id_bagian_jalan=_id_bagian_jalan(df)).drop(columns=kolom_id)
test = test.assign(id_bagian_jalan=_id_bagian_jalan(test)).drop(columns=kolom_id)

In [15]:
# Split the data into one CSV file per road segment (`id_bagian_jalan`).
# The original author notes there are 934 segments in the data.
def _simpan_per_bagian_jalan(data, folder, awalan=''):
    """Write one CSV per distinct `id_bagian_jalan` value into `folder`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an `id_bagian_jalan` column.
    folder : str
        Output directory; created if it does not exist yet.
    awalan : str
        Prefix put in front of the segment id in the file name.
    """
    os.makedirs(folder, exist_ok=True)
    # A single groupby pass replaces recomputing `unique()` and re-filtering the
    # whole frame for every segment. `sort=False` keeps first-appearance order.
    for id_bagian, potongan in data.groupby('id_bagian_jalan', sort=False):
        potongan.to_csv(os.path.join(folder, awalan + str(id_bagian) + '.csv'), index=False)


# Training files are named "df<id>.csv"; test files are named "<id>.csv".
# Later cells rely on exactly this naming, so it must not change.
_simpan_per_bagian_jalan(df, 'df_per_bagian_jalan', awalan='df')
_simpan_per_bagian_jalan(test, 'data_test_per_id_bagian_jalan')

# Melakukan interpolasi data pada data train

In [16]:
# Build a DataFrame holding every distinct 'waktu_setempat' value in the training data.
# The interpolation step uses the row position of this frame as its time axis, so the
# timestamps are sorted explicitly instead of relying on the row order of the CSV.
# NOTE(review): sorting the strings is chronological only if all timestamps share one
# format and UTC offset, as in the displayed sample - confirm for the full data.
waktu_unik = pd.Series(df['waktu_setempat'].unique(), name='waktu_setempat')
df_waktu = waktu_unik.sort_values(ignore_index=True).to_frame()
df_waktu

Unnamed: 0,waktu_setempat
0,2020-02-01 01:00:00+00:00
1,2020-02-01 02:00:00+00:00
2,2020-02-01 03:00:00+00:00
3,2020-02-01 04:00:00+00:00
4,2020-02-01 05:00:00+00:00
...,...
522,2020-02-22 19:00:00+00:00
523,2020-02-22 20:00:00+00:00
524,2020-02-22 21:00:00+00:00
525,2020-02-22 22:00:00+00:00


In [17]:
# For every road segment: align its rows to the full timestamp grid (`df_waktu`),
# fill missing speeds by linear interpolation, and save the result as a new CSV.
folder_hasil = 'sudah_interpolasi_df_id jalan_id titik mulai_id titik akhir_csv/'
os.makedirs(folder_hasil, exist_ok=True)

direktori = 'df_per_bagian_jalan/'

# Keep only CSV files so stray entries (e.g. Jupyter checkpoint folders) are skipped.
daftar_file = [nama for nama in os.listdir(direktori) if nama.endswith('.csv')]

for nama_file in daftar_file:
    df_Nan = pd.read_csv(direktori + nama_file)
    # Left join on the timestamp grid: timestamps absent for this segment become NaN rows.
    df_belum_interpolasi = df_waktu.join(df_Nan.set_index('waktu_setempat'), on='waktu_setempat')

    kosong = df_belum_interpolasi['rerata_kecepatan'].isna().to_numpy()
    if kosong.any():
        # Take the segment id from the file that was read: the first row of the
        # joined frame may itself be one of the missing rows.
        df_belum_interpolasi['id_bagian_jalan'] = df_belum_interpolasi['id_bagian_jalan'].fillna(
            df_Nan['id_bagian_jalan'].iloc[0]
        )

        # The row label on the timestamp grid is the x-axis of the interpolation.
        # Known and missing points are both derived from the same speed mask, so
        # `xp` and `yp` always have matching lengths.
        posisi = df_belum_interpolasi.index.to_numpy()
        x = posisi[kosong]
        xp = posisi[~kosong]
        yp = df_belum_interpolasi.loc[~kosong, 'rerata_kecepatan'].to_numpy()
        # np.interp holds the first/last known value for gaps at either end.
        hasil_interpolasi = np.interp(x, xp, yp)

        # A single .loc assignment instead of chained indexing, which does not
        # reliably write back to the frame.
        df_belum_interpolasi.loc[kosong, 'rerata_kecepatan'] = hasil_interpolasi

    df_belum_interpolasi.to_csv(folder_hasil + 'sudah_interpolasi_' + nama_file, index=False)

# Melakukan prediksi dengan model terbaik

In [18]:
# Train one Random Forest per road segment on its interpolated training data,
# predict the speeds for that segment's test rows, and collect all predictions
# into a single submission file.
def _tambah_fitur_waktu(data):
    """Return a copy of `data` with 'jam', 'hari' and 'weekend' columns added.

    The features come from the wall-clock part of 'waktu_setempat' (its first 19
    characters, "YYYY-MM-DD HH:MM:SS"). The weekday comes from the real calendar
    date rather than a day-of-month modulo table, so it is correct for any month.
    """
    waktu = pd.to_datetime(data['waktu_setempat'].str[:19])
    hasil = data.copy()
    hasil['jam'] = waktu.dt.hour.astype('int64')
    hasil['hari'] = waktu.dt.day_name()
    hasil['weekend'] = hasil['hari'].isin(['Saturday', 'Sunday']).astype('int64')
    return hasil


def _kelas_kelancaran_per_jam(data_latih):
    """Map each hour 0..23 to a class 0..3 by quartile of the mean speed per hour.

    Class 0 is below the 25th percentile of the 24 hourly means, class 1 below
    the 50th, class 2 below the 75th, and class 3 is everything else. Hours
    without data get class 3 and are ignored when computing the percentiles.
    """
    rerata_per_jam = [
        data_latih[data_latih['jam'] == jam]['rerata_kecepatan'].mean()
        for jam in range(24)
    ]
    # nanpercentile: a single empty hour must not turn every threshold into NaN.
    p25, p50, p75 = np.nanpercentile(rerata_per_jam, [25, 50, 75])

    def _kelas(nilai):
        if nilai < p25:
            return 0
        if nilai < p50:
            return 1
        if nilai < p75:
            return 2
        return 3

    return {jam: _kelas(nilai) for jam, nilai in enumerate(rerata_per_jam)}


folder_prediksi = 'df_hasil_prediksi_RF/'
os.makedirs(folder_prediksi, exist_ok=True)

direktori = 'data_test_per_id_bagian_jalan/'
folder_latih = 'sudah_interpolasi_df_id jalan_id titik mulai_id titik akhir_csv/'

# Keep only CSV files so stray entries (e.g. Jupyter checkpoint folders) are skipped.
daftar_file = [nama for nama in os.listdir(direktori) if nama.endswith('.csv')]

total_id = []
total_forecast = []

for nama_file in daftar_file:
    # Per-segment frames get their own names so the notebook-level `df` and `test`
    # are not overwritten by this loop.
    test_jalan = pd.read_csv(direktori + nama_file)
    latih_jalan = pd.read_csv(folder_latih + 'sudah_interpolasi_df' + nama_file)

    # Keep the ids for the output; the id itself is not a model feature.
    test_id = test_jalan['id']
    test_jalan = _tambah_fitur_waktu(test_jalan.drop(columns=['id']))
    latih_jalan = _tambah_fitur_waktu(latih_jalan)

    # The hourly class is learned from the training data and applied to both sets.
    kelas_per_jam = _kelas_kelancaran_per_jam(latih_jalan)
    latih_jalan['kelas_kelancaran'] = latih_jalan['jam'].map(kelas_per_jam)
    test_jalan['kelas_kelancaran'] = test_jalan['jam'].map(kelas_per_jam)

    kolom_buang = ['waktu_setempat', 'id_bagian_jalan']
    test_jalan = pd.get_dummies(test_jalan.drop(columns=kolom_buang), columns=['jam', 'hari'])
    latih_jalan = pd.get_dummies(latih_jalan.drop(columns=kolom_buang), columns=['jam', 'hari'])

    # Align the training features to the test columns (same set, same order).
    # A dummy column present only in the test data becomes all zeros instead of
    # raising a KeyError.
    y = latih_jalan['rerata_kecepatan']
    X = latih_jalan.drop(columns=['rerata_kecepatan']).reindex(columns=test_jalan.columns, fill_value=0)

    reg = RandomForestRegressor(random_state=42)
    reg.fit(X, y)

    forecast = reg.predict(test_jalan)

    # Per-segment predictions, written to their own file.
    forecast_df = pd.DataFrame({'id': test_id, 'rerata_kecepatan': forecast.flatten().tolist()})
    forecast_df.to_csv(folder_prediksi + 'hasil_prediksi_RF_' + nama_file, index=False)

    # extend() appends in place; `a = a + b` would copy the whole list each time.
    total_id.extend(test_id.tolist())
    total_forecast.extend(forecast.flatten().tolist())

# Combine all segments into the final submission, ordered by id.
forecast_total_df = pd.DataFrame({'id': total_id, 'rerata_kecepatan': total_forecast})
forecast_total_df = forecast_total_df.sort_values(by=['id'])
forecast_total_df.to_csv('submission_Bintang_Riset_SC.csv', index=False)