## Outliers

In [1]:
import numpy as np

L = [22, 25, 25, 25, 26, 26, 27, 29, 30, 31, 31, 31, 102]

data = np.array(L)

Q1 = np.percentile(data, 25)

Q2 = np.percentile(data, 50)

Q3 = np.percentile(data, 75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

filtered_data = data[(data >= lower_bound) & (data <= upper_bound)]

print(f"Original data: {L}")
print(f"Q1 (First Quartile): {Q1}")
print(f"Q2 (Median): {Q2}")
print(f"Q3 (Third Quartile): {Q3}")
print(f"IQR (Interquartile Range): {IQR}")
print(f"Lower bound for outliers: {lower_bound}")
print(f"Upper bound for outliers: {upper_bound}")
print(f"Filtered data (without outliers): {filtered_data}")


Original data: [22, 25, 25, 25, 26, 26, 27, 29, 30, 31, 31, 31, 102]
Q1 (First Quartile): 25.0
Q2 (Median): 27.0
Q3 (Third Quartile): 31.0
IQR (Interquartile Range): 6.0
Lower bound for outliers: 16.0
Upper bound for outliers: 40.0
Filtered data (without outliers): [22 25 25 25 26 26 27 29 30 31 31 31]


In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv(r"C:\Users\admin\Desktop\Machine learning\Day-2\california_housing_test.csv")

In [4]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
...,...,...,...,...,...,...,...,...,...
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0
2998,-117.12,34.10,40.0,96.0,14.0,46.0,14.0,3.2708,162500.0


In [5]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [17]:
import warnings 
warnings.filterwarnings('ignore')
new_data = df.copy()
def outlier(col):
    data2 = np.array(col)
    Q1 = np.percentile(col, 25)
    Q3 = np.percentile(col, 75)

    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    filter_data = new_data[(col < lower_bound) | (col > upper_bound)].index
    for i in filter_data:
        new_data.drop(i, inplace=True)
    return new_data

In [32]:
outlier(df['latitude'])
outlier(df['longitude'])
outlier(df['housing_median_age'])
outlier(df['total_rooms'])
outlier(df['total_bedrooms'])
outlier(df['population'])
outlier(df['households'])
outlier(df['median_house_value'])
outlier(df['median_income'])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
...,...,...,...,...,...,...,...,...,...
2991,-117.17,34.28,13.0,4867.0,718.0,780.0,250.0,7.1997,253800.0
2994,-117.93,33.86,35.0,931.0,181.0,516.0,174.0,5.5867,182500.0
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0


In [33]:
describe = df.describe()['longitude']
describe['25%']

-121.81

In [34]:
np.percentile(new_data['total_rooms'],25)
describe['25%']

-121.81

In [47]:
def l1_normalization(series):
    return series / np.sum(np.abs(series))

In [48]:
print(l1_normalization(new_data['longitude']))
print(l1_normalization(new_data['latitude']))
print(l1_normalization(new_data['housing_median_age']))
print(l1_normalization(new_data['total_rooms']))
print(l1_normalization(new_data['total_bedrooms']))
print(l1_normalization(new_data['population']))
print(l1_normalization(new_data['households']))
print(l1_normalization(new_data['median_income']))
print(l1_normalization(new_data['median_house_value']))

0      -0.000395
1      -0.000383
2      -0.000382
3      -0.000384
4      -0.000388
          ...   
2991   -0.000380
2994   -0.000382
2995   -0.000388
2997   -0.000388
2998   -0.000379
Name: longitude, Length: 2580, dtype: float64
0       0.000406
1       0.000372
2       0.000367
3       0.000367
4       0.000394
          ...   
2991    0.000372
2994    0.000368
2995    0.000374
2997    0.000394
2998    0.000370
Name: latitude, Length: 2580, dtype: float64
0       0.000352
1       0.000560
2       0.000352
3       0.000365
4       0.000248
          ...   
2991    0.000169
2994    0.000456
2995    0.000300
2997    0.000130
2998    0.000521
Name: housing_median_age, Length: 2580, dtype: float64
0       0.000708
1       0.000275
2       0.000654
3       0.000012
4       0.000226
          ...   
2991    0.000887
2994    0.000170
2995    0.000264
2997    0.000174
2998    0.000018
Name: total_rooms, Length: 2580, dtype: float64
0       0.000579
1       0.000272
2       0.000444
3      

In [49]:
def l2_normalization(series):
    return series / np.sqrt(np.sum(np.square(series)))

In [50]:
print(l2_normalization(new_data['longitude']))
print(l2_normalization(new_data['latitude']))
print(l2_normalization(new_data['housing_median_age']))
print(l2_normalization(new_data['total_rooms']))
print(l2_normalization(new_data['total_bedrooms']))
print(l2_normalization(new_data['population']))
print(l2_normalization(new_data['households']))
print(l2_normalization(new_data['median_income']))
print(l2_normalization(new_data['median_house_value']))

0      -0.020085
1      -0.019468
2      -0.019387
3      -0.019477
4      -0.019693
          ...   
2991   -0.019282
2994   -0.019407
2995   -0.019724
2997   -0.019698
2998   -0.019273
Name: longitude, Length: 2580, dtype: float64
0       0.020573
1       0.018861
2       0.018597
3       0.018619
4       0.020001
          ...   
2991    0.018872
2994    0.018641
2995    0.018949
2997    0.019984
2998    0.018773
Name: latitude, Length: 2580, dtype: float64
0       0.016544
1       0.026347
2       0.016544
3       0.017156
4       0.011642
          ...   
2991    0.007965
2994    0.021445
2995    0.014093
2997    0.006127
2998    0.024509
Name: housing_median_age, Length: 2580, dtype: float64
0       0.031909
1       0.012402
2       0.029477
3       0.000550
4       0.010193
          ...   
2991    0.039974
2994    0.007647
2995    0.011909
2997    0.007852
2998    0.000788
Name: total_rooms, Length: 2580, dtype: float64
0       0.026250
1       0.012311
2       0.020134
3      