In [2]:
import numpy as np
import pandas as pd

In [8]:
# names is used here to give column names as required
# skiprows to skip the first row which is the column name in the dataset which is already defined other than what we defined in names 

df = pd.read_csv('income.csv', names = ["name", "income"], skiprows = [0])
df

Unnamed: 0,name,income
0,Rob,5000
1,Rafiq,6000
2,Nina,4000
3,Sofia,7500
4,Mohan,8000
5,Tao,7000
6,Elon Musk,10000000


In [10]:
#describe gives the information of the dataset such as mean, percentiles,..

df.income.describe()

count    7.000000e+00
mean     1.433929e+06
std      3.777283e+06
min      4.000000e+03
25%      5.500000e+03
50%      7.000000e+03
75%      7.750000e+03
max      1.000000e+07
Name: income, dtype: float64

In [22]:
df.income.quantile(0.25, interpolation = "lower")

5000

In [24]:
df.income.quantile(0.75)

7750.0

In [26]:
df.income.quantile(0.5, interpolation = "lower")

7000

In [14]:
df.income.quantile(1)

10000000.0

In [28]:
perc_99 = df.income.quantile(0.99)
perc_99

9400479.999999994

In [30]:
df[df.income > perc_99]

Unnamed: 0,name,income
6,Elon Musk,10000000


Removing an Outlier

In [32]:
df_no_outlier = df[df.income < perc_99]
df_no_outlier

Unnamed: 0,name,income
0,Rob,5000
1,Rafiq,6000
2,Nina,4000
3,Sofia,7500
4,Mohan,8000
5,Tao,7000


Filling a Null Value

In [40]:
df['income'][3] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['income'][3] = np.NaN


In [42]:
df

Unnamed: 0,name,income
0,Rob,5000.0
1,Rafiq,6000.0
2,Nina,4000.0
3,Sofia,
4,Mohan,8000.0
5,Tao,7000.0
6,Elon Musk,10000000.0


In [44]:
df.income.mean()

1671666.6666666667

As the outlier influence is too much on the mean as seen above, it is advisible to use median value to fill the null value in this case.

In [46]:
df.income.median()

6500.0

In [48]:
df_noNull = df.fillna(df.income.median())
df_noNull

Unnamed: 0,name,income
0,Rob,5000.0
1,Rafiq,6000.0
2,Nina,4000.0
3,Sofia,6500.0
4,Mohan,8000.0
5,Tao,7000.0
6,Elon Musk,10000000.0


Working with Airbnb New York City Data Set

In [52]:
df1 = pd.read_csv('AB_NYC_2019.csv')
df1.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [56]:
df1['price']

0        149
1        225
2        150
3         89
4         80
        ... 
48890     70
48891     40
48892    115
48893     55
48894     90
Name: price, Length: 48895, dtype: int64

In [58]:
df1.size

782320

In [60]:
df1.shape #There are 48895 records

(48895, 16)

In [64]:
df1.price.isnull().sum() #There are no null values in the price column

0

In [66]:
df1.price.describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [68]:
df1.price.mean()

152.7206871868289

In [70]:
df1.price.median()

106.0

We can see from above that the mean is effected from outliers

In [72]:
df1.price.quantile(0.25)

69.0

In [74]:
df1.price.quantile(0)

0.0

In [76]:
df1.price.quantile(0.75)

175.0

In [78]:
df1.price.quantile(1)

10000.0

In [80]:
df1.price.quantile(0.65)

149.0

In [82]:
df1.price.quantile(0.95)

355.0

In [84]:
df1.price.quantile(0.9)

269.0

In [86]:
df1.price.quantile(0.99)

799.0

In [100]:
df1.price[df1.price >= df1.price.quantile(1)]

9151     10000
17692    10000
29238    10000
Name: price, dtype: int64

Consider min and max threshold to reduce outliers 

In [110]:
minThr, maxThr = df1.price.quantile([0.01, 0.999])

In [112]:
minThr, maxThr

(30.0, 3000.0)

In [140]:
df1.price[df1.price <= minThr].count()

659

In [142]:
df1.price[df1.price >=  maxThr].count()

53

In [128]:
df2 = df1[(df1.price > minThr) & (df1.price < maxThr)]
df2.shape

(48183, 16)

In [144]:
df2.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [146]:
df2.sample(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
10981,8480087,West Side Studio Apartment,44660079,Leslie,Manhattan,Upper West Side,40.77875,-73.9875,Entire home/apt,165,5,25,2019-05-24,0.54,2,70
41638,32370730,CITY VIEWS LARGE APARTMENT,243037591,Maria,Manhattan,Flatiron District,40.74189,-73.99019,Entire home/apt,488,3,14,2019-07-02,3.65,1,76
8846,6784378,Large Room & Own Bath by train,7788268,Marites,Brooklyn,Prospect-Lefferts Gardens,40.65894,-73.95947,Private room,75,2,10,2018-05-07,0.22,1,52
36480,28994671,한성 韓城 Han A (2FL),92706260,Kane,Queens,Flushing,40.76126,-73.81549,Private room,48,1,35,2019-07-01,3.79,5,79
45557,34832415,Gay friendly,6503950,Rob,Brooklyn,Bushwick,40.69339,-73.90588,Private room,50,1,6,2019-06-30,4.74,2,311
14834,11751718,Modern 2 BR with high end Finishes,61391963,Corporate Housing,Manhattan,Kips Bay,40.74128,-73.98039,Entire home/apt,159,30,3,2019-03-01,0.11,91,310
32962,25998807,"Great Private Room in Midtown, great Location!",193364875,Silva,Manhattan,Hell's Kitchen,40.76311,-73.98704,Private room,98,2,64,2019-07-05,5.65,1,54
37579,29806992,"Luxurious townhouse, 2bd w/Loft+2bath+High-cei...",15058648,Shola,Brooklyn,Bushwick,40.69798,-73.92968,Entire home/apt,135,3,5,2019-07-03,0.71,3,17
15085,12034640,Comfortable 420 Friendly Room,64411228,Virginia,Brooklyn,Williamsburg,40.70922,-73.95232,Private room,64,1,25,2019-05-13,0.63,1,0
23798,19218475,Cozy Private Large Bed in Prime Chelsea,134510659,Josif,Manhattan,Chelsea,40.74319,-73.99636,Private room,90,14,0,,,1,0


In [148]:
df2.price.describe()

count    48183.000000
mean       148.772036
std        153.594795
min         31.000000
25%         70.000000
50%        110.000000
75%        179.000000
max       2999.000000
Name: price, dtype: float64

References: 

https://github.com/codebasics/math-for-machine-learning/blob/main/4_mean_percentile/Exercise/exercise.md

https://github.com/codebasics/math-for-machine-learning/blob/main/4_mean_percentile/median_percentile.ipynb

https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data/data