In [2]:
import pandas as pd
import numpy as np

In [1]:
# Location of the ratings CSV, relative to the notebook kernel's working directory.
data_path = "data/ratings.csv"

In [3]:
# Read the whole ratings CSV into a DataFrame using pandas' default parsing options.
data = pd.read_csv(data_path)

### Describe the dataframe

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# NOTE(review): userId/movieId look like identifiers and timestamp like epoch seconds,
# so presumably only the `rating` statistics are meaningful here - confirm.
data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


### Sample 10 random rows from the data

In [5]:
# Peek at 10 randomly chosen rows. A fixed random_state makes the draw reproducible,
# so the displayed rows are the same after every "Restart Kernel -> Run All".
data.sample(n=10, random_state=42)

Unnamed: 0,userId,movieId,rating,timestamp
50644,373,1387,4.0,939235173
14874,96,25850,5.0,1223256486
88072,585,3429,4.0,975362198
57769,418,43836,1.5,1145048460
87996,585,2078,4.0,975363316
21689,150,3408,2.5,1114306379
76578,529,81845,4.0,1371270054
28990,212,6541,2.5,1218955714
1720,15,3943,3.0,997937860
3326,19,891,3.0,855194072


### Selecting and indexing in pandas

In [12]:
# Positional slice: the first ten rows, with every column kept.
data.iloc[:10]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [15]:
# Boolean-mask selection: keep only the rows whose movieId equals 31.
data.loc[data["movieId"].eq(31)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
498,7,31,3.0,851868750
6059,31,31,4.0,1273541953
6130,32,31,4.0,834828440
6526,36,31,3.0,847057202
6773,39,31,3.0,832525157
10223,73,31,3.5,1255591860
13516,88,31,3.0,1239755559
14810,96,31,2.5,1223256331
16867,110,31,4.0,840100695


### Data types of columns of dataframe

In [16]:
# Show the dtype pandas inferred for each column when the CSV was read.
data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Binning values in pandas (`pd.cut`)

In [21]:
# Distinct rating values, in order of first appearance.
data["rating"].unique()

array([2.5, 3. , 2. , 4. , 3.5, 1. , 5. , 4.5, 1.5, 0.5])

In [33]:
# Bucket every rating into one of five labelled, ordered bins:
# [0, 1], (1, 2], (2, 3], (3, 4], (4, 5].
# include_lowest=True closes the first bin on its left edge, and sort=False keeps
# the counts in bin order rather than ordering them by frequency.
pd.cut(
    data["rating"],
    bins=[0, 1, 2, 3, 4, 5],
    labels=["very_low", "low", "medium", "high", "very_high"],
    include_lowest=True,
).value_counts(sort=False)

very_low      4427
low           8958
medium       24513
high         39288
very_high    22818
Name: rating, dtype: int64

### Cross-tabulation: counting ratings per movie in pandas

In [35]:
# Frequency table: one row per movieId, one column per rating value.
# margins=True appends the "All" row/column holding the totals.
pd.crosstab(
    index=data["movieId"],
    columns=data["rating"],
    margins=True,
)

rating,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,All
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,4,3,13,4,41,23,77,19,63,247
2,0,0,1,12,6,37,10,27,1,13,107
3,1,3,2,9,2,18,2,13,2,7,59
4,0,3,1,1,0,7,1,0,0,0,13
5,0,3,0,5,2,23,3,12,5,3,56
6,0,1,0,1,3,26,6,38,5,24,104
7,2,2,0,5,0,20,4,13,0,7,53
8,0,1,0,0,0,0,0,2,0,2,5
9,0,1,0,1,0,13,0,4,0,1,20
10,0,1,1,6,5,44,17,36,3,9,122


### One-hot encoding data

In [37]:
# Location of the forest-fires CSV, relative to the notebook kernel's working directory.
forest_fires_data_path = "data/forestfires.csv"

In [38]:
# Read the forest-fires CSV into a DataFrame using pandas' default parsing options.
forest_fires_df = pd.read_csv(forest_fires_data_path)

In [40]:
# Peek at 10 randomly chosen rows. A fixed random_state makes the draw reproducible,
# so the displayed rows are the same after every "Restart Kernel -> Run All".
forest_fires_df.sample(n=10, random_state=42)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
210,4,4,oct,sat,90.6,43.7,686.9,6.7,18.4,25,3.1,0.0,24.23
475,2,5,jun,thu,93.7,121.7,350.2,18.0,22.7,40,9.4,0.0,3.19
248,8,6,aug,wed,93.1,157.3,666.7,13.5,28.7,28,2.7,0.0,0.0
56,4,3,sep,tue,90.3,80.7,730.2,6.3,17.8,63,4.9,0.0,0.0
489,4,4,aug,wed,95.1,141.3,605.8,17.7,20.6,58,1.3,0.0,0.0
223,2,2,jul,fri,88.3,150.3,309.9,6.8,13.4,79,3.6,0.0,37.02
508,1,2,aug,fri,91.0,166.9,752.6,7.1,25.9,41,3.6,0.0,0.0
453,4,5,aug,thu,89.4,266.2,803.3,5.6,17.4,54,3.1,0.0,0.0
72,5,4,mar,fri,91.7,33.3,77.5,9.0,15.6,25,6.3,0.0,0.0
29,6,3,sep,sun,93.5,149.3,728.6,8.1,22.8,39,3.6,0.0,0.0


In [42]:
# One-hot encode the string columns (month, day): each category becomes its own
# indicator column (month_sep, day_fri, ...), while numeric columns pass through unchanged.
# A fixed random_state keeps the sampled preview reproducible across re-runs.
pd.get_dummies(forest_fires_df).sample(n=10, random_state=42)

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_nov,month_oct,month_sep,day_fri,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
7,8,6,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,...,0,0,0,0,1,0,0,0,0,0
16,5,5,91.7,35.8,80.8,7.8,15.1,27,5.4,0.0,...,0,0,0,0,0,1,0,0,0,0
25,7,4,91.4,142.4,601.4,10.6,16.3,60,5.4,0.0,...,0,0,0,0,0,0,1,0,0,0
227,8,6,92.2,81.8,480.8,11.9,20.1,34,4.5,0.0,...,0,0,0,0,0,1,0,0,0,0
118,3,4,90.1,39.7,86.6,6.2,10.6,30,4.0,0.0,...,0,0,0,0,1,0,0,0,0,0
51,4,3,90.2,99.6,631.2,6.3,21.5,34,2.2,0.0,...,0,0,0,0,0,0,1,0,0,0
184,8,6,93.9,135.7,586.7,15.1,20.8,34,4.9,0.0,...,0,0,0,1,0,0,0,0,0,0
152,3,4,90.1,51.2,424.1,6.2,24.6,43,1.8,0.0,...,0,0,0,0,0,1,0,0,0,0
460,2,5,93.7,231.1,715.1,8.4,18.9,64,4.9,0.0,...,0,0,0,0,0,1,0,0,0,0
513,2,4,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,...,0,0,0,0,0,0,1,0,0,0
