## Import Library

In [331]:
import pandas as pd

## Import Dataset

In [332]:
df = pd.read_csv('data/df_electronics.csv')
df.head()

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0
3,0,3,1.0,1999-07-01,Female,Portable Audio & Video,,1999,,0
4,0,4,2.0,1999-07-06,Female,Portable Audio & Video,,1999,,0


## Data Engineering

In [333]:
df.shape

(1292954, 10)

In [334]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1292954 entries, 0 to 1292953
Data columns (total 10 columns):
item_id       1292954 non-null int64
user_id       1292954 non-null int64
rating        1292954 non-null float64
timestamp     1292954 non-null object
model_attr    1292954 non-null object
category      1292954 non-null object
brand         331120 non-null object
year          1292954 non-null int64
user_attr     174124 non-null object
split         1292954 non-null int64
dtypes: float64(1), int64(4), object(5)
memory usage: 98.6+ MB


In [335]:
df.describe()

Unnamed: 0,item_id,user_id,rating,year,split
count,1292954.0,1292954.0,1292954.0,1292954.0,1292954.0
mean,4183.588,560512.7,4.051482,2012.938,0.1747587
std,2525.346,334237.4,1.379732,2.643513,0.550681
min,0.0,0.0,1.0,1999.0,0.0
25%,2018.0,269748.2,4.0,2012.0,0.0
50%,3930.0,551892.0,5.0,2014.0,0.0
75%,6289.0,847485.8,5.0,2015.0,0.0
max,9559.0,1157632.0,5.0,2018.0,2.0


In [336]:
df.isnull().sum()

item_id             0
user_id             0
rating              0
timestamp           0
model_attr          0
category            0
brand          961834
year                0
user_attr     1118830
split               0
dtype: int64

In [337]:
df['item_id'].value_counts().sort_values(ascending=False)

7780    28530
2031     9393
1575     8622
2279     6278
2486     5810
        ...  
1529        1
4253        1
9559        1
7810        1
3972        1
Name: item_id, Length: 9560, dtype: int64

## Membuat kolom count dan mean dari sebuah rating

In [338]:
df1 = df.groupby('item_id')['rating'].agg(['count','mean'])
df1 = df1.sort_values('count', ascending=False)

In [339]:
df1.head()

Unnamed: 0_level_0,count,mean
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7780,28530,4.400911
2031,9393,4.418184
1575,8622,4.218743
2279,6278,4.168844
2486,5810,4.306368


### menghapus kolom yang tidak berguna dan banyak missing valuenya. disini kita tidak menghapus kolom brand di karenakan kolom tersebut sangat penting

In [340]:
df.drop(columns=['timestamp', 'user_attr', 'split', 'model_attr'], inplace=True)

### Menggabungkan kolom dari count dan mean kedalam dataframe awal kita

In [341]:
df = pd.concat([df, df1], join = 'outer', axis = 1).sort_values('count', ascending=False)
df.head(10)

Unnamed: 0,item_id,user_id,rating,category,brand,year,count,mean
7780,372,7509,4.0,Camera & Photo,Philips,2006,28530.0,4.400911
2031,119,1963,5.0,Computers & Accessories,Toshiba,2004,9393.0,4.418184
1575,73,1530,5.0,Camera & Photo,Canon,2002,8622.0,4.218743
2279,121,2203,3.0,Computers & Accessories,Linksys,2003,6278.0,4.168844
2486,145,2399,1.0,Camera & Photo,Canon,2004,5810.0,4.306368
2340,83,2258,5.0,Computers & Accessories,Linksys,2003,5790.0,4.625734
5795,300,5583,5.0,Computers & Accessories,,2006,5554.0,3.921318
1715,93,1665,3.0,Camera & Photo,Sony,2003,5482.0,4.797337
1124,3,1103,1.0,Camera & Photo,,2005,5353.0,3.998879
1886,39,1826,5.0,Camera & Photo,Sony,2001,5323.0,4.109149


## Phase 1 : Filtering

In [342]:
df['category'].unique()

array(['Camera & Photo', 'Computers & Accessories',
       'Portable Audio & Video', 'Accessories & Supplies', 'Headphones',
       'Television & Video', 'Home Audio', 'Car Electronics & GPS',
       'Security & Surveillance', 'Wearable Technology'], dtype=object)

In [343]:
category = 'Computers & Accessories'
brand = 'Toshiba'
year = (2000, 2016)
topk = 10

In [344]:
filter_criteria = (df['category'] == category) & (df['brand'] == brand) & (df['year'].between(year[0], year[1]))

In [345]:
result = df[ filter_criteria ]
result.head()

Unnamed: 0,item_id,user_id,rating,category,brand,year,count,mean
2031,119,1963,5.0,Computers & Accessories,Toshiba,2004,9393.0,4.418184
2237,144,2163,5.0,Computers & Accessories,Toshiba,2004,3777.0,4.368546
679,44,668,5.0,Computers & Accessories,Toshiba,2001,847.0,3.149941
2380,151,2199,5.0,Computers & Accessories,Toshiba,2006,671.0,4.19225
2083,119,2012,5.0,Computers & Accessories,Toshiba,2004,501.0,4.786427


## Phase 2 : Scoring (Sudah kita lakukan diawal tadi yaitu count dan mean)

## Phase 3 : Sorting

In [346]:
recommendation = result.sort_values("count", ascending=False).head(topk) 
recommendation

Unnamed: 0,item_id,user_id,rating,category,brand,year,count,mean
2031,119,1963,5.0,Computers & Accessories,Toshiba,2004,9393.0,4.418184
2237,144,2163,5.0,Computers & Accessories,Toshiba,2004,3777.0,4.368546
679,44,668,5.0,Computers & Accessories,Toshiba,2001,847.0,3.149941
2380,151,2199,5.0,Computers & Accessories,Toshiba,2006,671.0,4.19225
2083,119,2012,5.0,Computers & Accessories,Toshiba,2004,501.0,4.786427
2085,44,2014,1.0,Computers & Accessories,Toshiba,2001,489.0,4.351738
2138,125,2066,5.0,Computers & Accessories,Toshiba,2005,357.0,3.817927
2019,119,1951,5.0,Computers & Accessories,Toshiba,2004,321.0,4.47352
1839,119,1783,2.0,Computers & Accessories,Toshiba,2004,262.0,4.030534
1176,44,1153,1.0,Computers & Accessories,Toshiba,2001,124.0,3.41129


### export dataframe yang telah di modifikasi ke sebuah file dengan ekstensi csv

In [347]:
csv_data = df.to_csv('electronics.csv', index =False)

## ML engineering

In [348]:
class RecommenderSystem:
    def __init__(self, data):
        self.df = pd.read_csv(data)
    
    def recommend(self, category=None, brand=None, year=None, topk=10):
        df = self.df.copy()
        df = self.demographic_filter(df, category=category, brand=brand, year=year)
        
        result = df.sort_values("count", ascending=False)
        result = result.head(topk) 
        return result
    
    @staticmethod
    def demographic_filter(df, category=None, brand=None, year=None, topk=10):
        df = df.copy()
        
        if category is not None :
            df = df[(df['category'] == category)]
        if brand is not None :
            df = df[(df['brand'] == brand)]
        if year is not None :
            df = df[(df['year'].between(year[0], year[1]))]
        return df
    
    

In [349]:
recsys = RecommenderSystem(data='electronics.csv')

## Berikut adalah hasil rekomendasi berdasarkan barang dengan pembeli terbanyak dan juga telah kita filter terlebih dahulu

In [350]:
recsys.recommend(category='Computers & Accessories', brand='Toshiba', year=(2000,2016))

Unnamed: 0,item_id,user_id,rating,category,brand,year,count,mean
1,119,1963,5.0,Computers & Accessories,Toshiba,2004,9393.0,4.418184
30,144,2163,5.0,Computers & Accessories,Toshiba,2004,3777.0,4.368546
283,44,668,5.0,Computers & Accessories,Toshiba,2001,847.0,3.149941
375,151,2199,5.0,Computers & Accessories,Toshiba,2006,671.0,4.19225
550,119,2012,5.0,Computers & Accessories,Toshiba,2004,501.0,4.786427
565,44,2014,1.0,Computers & Accessories,Toshiba,2001,489.0,4.351738
777,125,2066,5.0,Computers & Accessories,Toshiba,2005,357.0,3.817927
864,119,1951,5.0,Computers & Accessories,Toshiba,2004,321.0,4.47352
1034,119,1783,2.0,Computers & Accessories,Toshiba,2004,262.0,4.030534
1914,44,1153,1.0,Computers & Accessories,Toshiba,2001,124.0,3.41129
