In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df1 = pd.read_csv("dummy_user.csv")
df1.head()

Unnamed: 0,User ID,nama,jenis_kelamin,pekerjaan,usia,kota,Rating,Car Make,Price
0,1,John Doe,Laki-laki,Dokter,30.0,Jakarta,8.5,BMW M4,800000000.0
1,2,Jane Smith,Perempuan,Pengacara,35.0,Surabaya,7.8,Porsche 911,1000000000.0
2,3,Michael Johnson,Laki-laki,Pengusaha,40.0,Bandung,8.9,Audi R8,1500000000.0
3,4,Emily Brown,Perempuan,Pengajar,28.0,Medan,7.2,Ford Mustang,600000000.0
4,5,David Wilson,Laki-laki,Insinyur,45.0,Yogyakarta,9.3,Chevrolet Corvette,2000000000.0


In [3]:
df2 = pd.read_csv("list_cars2.csv")
df2.head()

Unnamed: 0,Car Make,Car Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0


In [4]:
# Gabungkan Dataset
merged_df = pd.concat([df1, df2])
print("Merged DataFrame secara Vertikal:")
print(merged_df)

Merged DataFrame secara Vertikal:
     User ID             nama jenis_kelamin  pekerjaan  usia        kota  \
0          1         John Doe     Laki-laki     Dokter  30.0     Jakarta   
1          2       Jane Smith     Perempuan  Pengacara  35.0    Surabaya   
2          3  Michael Johnson     Laki-laki  Pengusaha  40.0     Bandung   
3          4      Emily Brown     Perempuan   Pengajar  28.0       Medan   
4          5     David Wilson     Laki-laki   Insinyur  45.0  Yogyakarta   
...      ...              ...           ...        ...   ...         ...   
2054     NaN              NaN           NaN        NaN   NaN         NaN   
2055     NaN              NaN           NaN        NaN   NaN         NaN   
2056     NaN              NaN           NaN        NaN   NaN         NaN   
2057     NaN              NaN           NaN        NaN   NaN         NaN   
2058     NaN              NaN           NaN        NaN   NaN         NaN   

      Rating            Car Make         Price  \
0  

In [5]:
# Info tentang dataset, termasuk tipe data dan non-null counts
print("Informasi dataset:")
print(merged_df.info())

# Statistik deskriptif untuk setiap kolom numerik
print("\nStatistik deskriptif untuk kolom numerik:")
print(merged_df.describe())

# Cetak daftar kolom dalam dataset
print("\nDaftar kolom dalam dataset:")
print(merged_df.columns)

Informasi dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 2160 entries, 0 to 2058
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   User ID             101 non-null    object 
 1   nama                100 non-null    object 
 2   jenis_kelamin       100 non-null    object 
 3   pekerjaan           100 non-null    object 
 4   usia                100 non-null    float64
 5   kota                100 non-null    object 
 6   Rating              100 non-null    float64
 7   Car Make            2159 non-null   object 
 8   Price               2159 non-null   float64
 9   Car Model           2059 non-null   object 
 10  Year                2059 non-null   float64
 11  Kilometer           2059 non-null   float64
 12  Fuel Type           2059 non-null   object 
 13  Transmission        2059 non-null   object 
 14  Location            2059 non-null   object 
 15  Color               2059 non-null   objec

In [6]:
# i) Mencari dan Memproses Data yang Hilang
missing_data = merged_df.isnull().sum()
print("Data yang Hilang:\n", missing_data)

# ii) Nilai Data Campuran/Tipe Data yang Tidak Cocok
mixed_data = merged_df.applymap(type).nunique()
print("\nTipe Data Campuran:\n", mixed_data)

# iii) Pencarian Pencilan pada Data
# Menghitung z-score untuk kolom numerik
numeric_cols = merged_df.select_dtypes(include=['int', 'float']).columns
z_scores = (merged_df[numeric_cols] - merged_df[numeric_cols].mean()) / merged_df[numeric_cols].std()

# Menandai baris yang memiliki nilai z-score > 3 atau < -3 sebagai outlier
outliers = merged_df[(z_scores > 3) | (z_scores < -3)]
print("\nData Pencilan:\n", outliers)

Data yang Hilang:
 User ID               2059
nama                  2060
jenis_kelamin         2060
pekerjaan             2060
usia                  2060
kota                  2060
Rating                2060
Car Make                 1
Price                    1
Car Model              101
Year                   101
Kilometer              101
Fuel Type              101
Transmission           101
Location               101
Color                  101
Owner                  101
Seller Type            101
Engine                 181
Max Power              181
Max Torque             181
Drivetrain             237
Length                 165
Width                  165
Height                 165
Seating Capacity       165
Fuel Tank Capacity     214
dtype: int64

Tipe Data Campuran:
 User ID               2
nama                  2
jenis_kelamin         2
pekerjaan             2
usia                  1
kota                  2
Rating                1
Car Make              2
Price                 1
C

In [7]:
# Collaborative Filtering
# Buat User-Item Matrix
user_item_matrix = merged_df.pivot_table(index='User ID', columns='Car Make', values='Rating', fill_value=0)

In [8]:
# Hitung Similarity antar pengguna
user_similarity = cosine_similarity(user_item_matrix, user_item_matrix)

In [9]:
# Content-Based Filtering
# Combine information from multiple columns into one text column
merged_df['Car Features'] = merged_df['Car Make'] + ' ' + merged_df['Car Model'] + ' ' + merged_df['Year'].astype(str) + ' ' + merged_df['Color'] + ' ' + merged_df['Transmission'] + ' ' + merged_df['Fuel Type']

# Initialize TF-IDF vectorizer
# Ekstraksi Fitur
tfidf = TfidfVectorizer()

# Fit and transform the text data
car_features_tfidf = tfidf.fit_transform(merged_df['Car Features'].astype(str))

# Print feature names (words in the vocabulary)
print("Vocabulary:", tfidf.get_feature_names_out())

Vocabulary: ['0d' '0l' '110' '143' '143bhp' '16' '177bhp' '180' '1988' '1996' '1l'
 '200' '2000' '2002' '2004' '2006' '2007' '2008' '2009' '200d' '2010'
 '2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '20d' '220' '220d' '250' '250xl' '280' '2l' '2wd' '30'
 '300' '300d' '300h' '30d' '320d' '320i' '330i' '35' '350' '350d' '40'
 '400' '400d' '43' '45' '450' '488' '4matic' '4wd' '4x2' '4x4' '500'
 '500l' '520d' '525d' '530d' '530i' '55' '560' '5l' '610' '630d' '6s'
 '718' '730ld' '75' '800' '85' '8s' '8v' '911' '99' '9l' 'a3' 'a4' 'a6'
 'a7' 'a8' 'abs' 'ac' 'accent' 'accord' 'active' 'activity' 'adventure'
 'ags' 'airbag' 'alcazar' 'alloy' 'allspace' 'alpha' 'altis' 'alto'
 'altroz' 'alturas' 'amaze' 'ambiente' 'ambition' 'ameo' 'amg' 'amt'
 'anniversary' 'aqua' 'aspire' 'asta' 'astor' 'at' 'audi' 'aura'
 'automatic' 'avantgarde' 'avn' 'awd' 'ax' 'b180' 'baleno' 'base' 'beat'
 'beige' 'benz' 'bhp' 'bl' 'black' 'blue' 'blueefficiency' 'bmw' 'bolero'
 'b

In [10]:
# Hitung Similarity antar mobil
car_similarity = cosine_similarity(car_features_tfidf, car_features_tfidf)

In [11]:
# Hybrid Recommendation
def hybrid_recommendation(user_id):
    if user_id not in user_item_matrix.index:
        print("User ID", user_id, "tidak ditemukan dalam indeks DataFrame.")
        return None
    
    # Collaborative Filtering
    collab_recs = user_similarity[user_id] @ user_item_matrix.values
    
    # Content-Based Filtering
    content_recs = car_similarity @ user_item_matrix.loc[user_id]
    
    # Gabungkan dan beri bobot
    hybrid_recs = 0.5 * collab_recs + 0.5 * content_recs
    
    # Sort dan return rekomendasi
    return hybrid_recs.argsort()[-10:][::-1]


In [12]:
# Contoh Penggunaan
user_id = 1
recommendations = hybrid_recommendation(user_id)
if recommendations is not None:
    print("Rekomendasi untuk User", user_id, ":", recommendations)

User ID 1 tidak ditemukan dalam indeks DataFrame.
