In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw air-quality table; the file is read as cp1252-encoded text and
# low_memory=False makes pandas process each column in one pass.
df = pd.read_csv(
    'airquality_data.csv',
    low_memory=False,
    encoding='cp1252',
)

In [3]:
# Display the freshly loaded frame (pandas truncates the middle rows).
df

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",4.8,17.4,,,,,1990-02-01
1,151,February - M021990,Andhra Pradesh,Hyderabad,,Industrial Area,3.1,7.0,,,,,1990-02-01
2,152,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.2,28.5,,,,,1990-02-01
3,150,March - M031990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.3,14.7,,,,,1990-03-01
4,151,March - M031990,Andhra Pradesh,Hyderabad,,Industrial Area,4.7,7.5,,,,,1990-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
435737,SAMP,24-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RIRUO,22.0,50.0,143.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-24
435738,SAMP,29-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RIRUO,20.0,46.0,171.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-29
435739,,,andaman-and-nicobar-islands,,,,,,,,,,
435740,,,Lakshadweep,,,,,,,,,,


In [4]:
# List every distinct spelling of the area-type label, including NaN.
df['type'].unique()

array(['Residential, Rural and other Areas', 'Industrial Area', nan,
       'Sensitive Area', 'Industrial Areas', 'Residential and others',
       'Sensitive Areas', 'Industrial', 'Residential', 'RIRUO',
       'Sensitive'], dtype=object)

In [5]:
# Lookup table that collapses the spelling variants of the area-type label
# into short codes:
#   'RRO' - residential, rural and other areas (also the legacy 'RIRUO' tag)
#   'I'   - industrial
#   'S'   - sensitive
#   'RO'  - residential and others
#   'R'   - residential
# Missing labels (NaN) are folded into 'RRO' as well.
# NOTE(review): that treats "unknown" as residential/rural - confirm this is
# the intended default.
#
# `np.nan` is used as the key because the upper-case alias `np.NAN` was
# removed in NumPy 2.0 and raises AttributeError there.
dic = {
    'Residential, Rural and other Areas': 'RRO',
    'Industrial Area': 'I',
    np.nan: 'RRO',
    'Sensitive Area': 'S',
    'Industrial Areas': 'I',
    'Residential and others': 'RO',
    'Sensitive Areas': 'S',
    'Industrial': 'I',
    'Residential': 'R',
    'RIRUO': 'RRO',
    'Sensitive': 'S',
}

In [6]:
# Swap each raw area-type label (and NaN) for its short code from `dic`.
df['type'] = df['type'].replace(dic)

In [7]:
# Display the frame again to check that `type` now holds the short codes.
df

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150,February - M021990,Andhra Pradesh,Hyderabad,,RRO,4.8,17.4,,,,,1990-02-01
1,151,February - M021990,Andhra Pradesh,Hyderabad,,I,3.1,7.0,,,,,1990-02-01
2,152,February - M021990,Andhra Pradesh,Hyderabad,,RRO,6.2,28.5,,,,,1990-02-01
3,150,March - M031990,Andhra Pradesh,Hyderabad,,RRO,6.3,14.7,,,,,1990-03-01
4,151,March - M031990,Andhra Pradesh,Hyderabad,,I,4.7,7.5,,,,,1990-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
435737,SAMP,24-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RRO,22.0,50.0,143.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-24
435738,SAMP,29-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RRO,20.0,46.0,171.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-29
435739,,,andaman-and-nicobar-islands,,,RRO,,,,,,,
435740,,,Lakshadweep,,,RRO,,,,,,,


In [8]:
# Pollutant measurement columns that are used regularly in the cells below.
COLS = [
    'so2',
    'no2',
    'rspm',
    'spm',
    'pm2_5',
]

In [9]:
from sklearn.impute import SimpleImputer

# Fill the NaN readings in the pollutant columns with the mean of the
# respective column. The fitted imputer stays available as `simpleimputer`.
simpleimputer = SimpleImputer(strategy='mean', missing_values=np.nan)
simpleimputer.fit(df[COLS])
imputed_values = simpleimputer.transform(df[COLS])
df[COLS] = imputed_values

In [10]:
# Display the frame after imputation; the pollutant columns no longer show NaN.
df

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150,February - M021990,Andhra Pradesh,Hyderabad,,RRO,4.800000,17.400000,108.832784,220.78348,,40.791467,1990-02-01
1,151,February - M021990,Andhra Pradesh,Hyderabad,,I,3.100000,7.000000,108.832784,220.78348,,40.791467,1990-02-01
2,152,February - M021990,Andhra Pradesh,Hyderabad,,RRO,6.200000,28.500000,108.832784,220.78348,,40.791467,1990-02-01
3,150,March - M031990,Andhra Pradesh,Hyderabad,,RRO,6.300000,14.700000,108.832784,220.78348,,40.791467,1990-03-01
4,151,March - M031990,Andhra Pradesh,Hyderabad,,I,4.700000,7.500000,108.832784,220.78348,,40.791467,1990-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
435737,SAMP,24-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RRO,22.000000,50.000000,143.000000,220.78348,"Inside Rampal Industries,ULUBERIA",40.791467,2015-12-24
435738,SAMP,29-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RRO,20.000000,46.000000,171.000000,220.78348,"Inside Rampal Industries,ULUBERIA",40.791467,2015-12-29
435739,,,andaman-and-nicobar-islands,,,RRO,10.829414,25.809623,108.832784,220.78348,,40.791467,
435740,,,Lakshadweep,,,RRO,10.829414,25.809623,108.832784,220.78348,,40.791467,


In [11]:
# Inspect the column dtypes; `date` is still a plain object column here.
df.dtypes

stn_code                        object
sampling_date                   object
state                           object
location                        object
agency                          object
type                            object
so2                            float64
no2                            float64
rspm                           float64
spm                            float64
location_monitoring_station     object
pm2_5                          float64
date                            object
dtype: object

In [12]:
# Convert the `date` strings to datetime64; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df['date'] = parsed_dates

In [13]:
# Check the dtypes again to confirm `date` is now datetime64[ns].
df.dtypes

stn_code                               object
sampling_date                          object
state                                  object
location                               object
agency                                 object
type                                   object
so2                                   float64
no2                                   float64
rspm                                  float64
spm                                   float64
location_monitoring_station            object
pm2_5                                 float64
date                           datetime64[ns]
dtype: object

In [14]:
# Derive the calendar year from the parsed `date` column. Rows whose date is
# NaT get NaN, which is why the column is float (1990.0) rather than int.
df['year'] = df['date'].dt.year
df.head()

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date,year
0,150,February - M021990,Andhra Pradesh,Hyderabad,,RRO,4.8,17.4,108.832784,220.78348,,40.791467,1990-02-01,1990.0
1,151,February - M021990,Andhra Pradesh,Hyderabad,,I,3.1,7.0,108.832784,220.78348,,40.791467,1990-02-01,1990.0
2,152,February - M021990,Andhra Pradesh,Hyderabad,,RRO,6.2,28.5,108.832784,220.78348,,40.791467,1990-02-01,1990.0
3,150,March - M031990,Andhra Pradesh,Hyderabad,,RRO,6.3,14.7,108.832784,220.78348,,40.791467,1990-03-01,1990.0
4,151,March - M031990,Andhra Pradesh,Hyderabad,,I,4.7,7.5,108.832784,220.78348,,40.791467,1990-03-01,1990.0


In [15]:
# Integer codes for the short area-type labels produced by `dic`.
# NOTE(review): the numbers are arbitrary identifiers, not an ordering; a
# model that treats `type` as numeric will read an order into them.
# "RIRUO" is kept for compatibility, but `dic` already folds 'RIRUO' into
# 'RRO', so code 5 is not expected to occur.
types = {"RRO": 1, "I": 2, "RO": 3, "S": 4, "RIRUO": 5, "R": 6}

# `replace` on an object column only yields an integer column through
# implicit downcasting, which pandas 2.2 deprecates. The explicit cast pins
# the dtype and fails loudly if an unmapped label is still present.
# Re-running the cell is a no-op: the integers are not keys of `types`.
df['type'] = df['type'].replace(types).astype('int64')

In [16]:
# Display the frame to check that `type` now holds the integer codes.
df

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date,year
0,150,February - M021990,Andhra Pradesh,Hyderabad,,1,4.800000,17.400000,108.832784,220.78348,,40.791467,1990-02-01,1990.0
1,151,February - M021990,Andhra Pradesh,Hyderabad,,2,3.100000,7.000000,108.832784,220.78348,,40.791467,1990-02-01,1990.0
2,152,February - M021990,Andhra Pradesh,Hyderabad,,1,6.200000,28.500000,108.832784,220.78348,,40.791467,1990-02-01,1990.0
3,150,March - M031990,Andhra Pradesh,Hyderabad,,1,6.300000,14.700000,108.832784,220.78348,,40.791467,1990-03-01,1990.0
4,151,March - M031990,Andhra Pradesh,Hyderabad,,2,4.700000,7.500000,108.832784,220.78348,,40.791467,1990-03-01,1990.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435737,SAMP,24-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,1,22.000000,50.000000,143.000000,220.78348,"Inside Rampal Industries,ULUBERIA",40.791467,2015-12-24,2015.0
435738,SAMP,29-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,1,20.000000,46.000000,171.000000,220.78348,"Inside Rampal Industries,ULUBERIA",40.791467,2015-12-29,2015.0
435739,,,andaman-and-nicobar-islands,,,1,10.829414,25.809623,108.832784,220.78348,,40.791467,NaT,
435740,,,Lakshadweep,,,1,10.829414,25.809623,108.832784,220.78348,,40.791467,NaT,


In [17]:
# List the distinct state names before they are label-encoded.
df['state'].unique()

array(['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chandigarh', 'Chhattisgarh', 'Dadra & Nagar Haveli',
       'Daman & Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu & Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
       'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab',
       'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Uttar Pradesh',
       'Uttarakhand', 'Uttaranchal', 'West Bengal',
       'andaman-and-nicobar-islands', 'Lakshadweep', 'Tripura'],
      dtype=object)

In [18]:
from sklearn.preprocessing import LabelEncoder

# Replace the state names by integer codes. The fitted encoder is kept as
# `labelencoder` so codes can be translated back to names later.
# In the displayed rows 'Andhra Pradesh' ends up as code 0.
labelencoder = LabelEncoder()
state_codes = labelencoder.fit(df["state"]).transform(df["state"])
df["state"] = state_codes
df.head(70)

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date,year
0,150,February - M021990,0,Hyderabad,,1,4.8,17.4,108.832784,220.78348,,40.791467,1990-02-01,1990.0
1,151,February - M021990,0,Hyderabad,,2,3.1,7.0,108.832784,220.78348,,40.791467,1990-02-01,1990.0
2,152,February - M021990,0,Hyderabad,,1,6.2,28.5,108.832784,220.78348,,40.791467,1990-02-01,1990.0
3,150,March - M031990,0,Hyderabad,,1,6.3,14.7,108.832784,220.78348,,40.791467,1990-03-01,1990.0
4,151,March - M031990,0,Hyderabad,,2,4.7,7.5,108.832784,220.78348,,40.791467,1990-03-01,1990.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,95,January - M011992,0,Hyderabad,Andhra Pradesh Pollution Control Board,1,20.0,64.6,108.832784,353.00000,,40.791467,1992-01-01,1992.0
66,202,January - M011992,0,Hyderabad,Andhra Pradesh Pollution Control Board,1,14.6,4.4,108.832784,190.00000,,40.791467,1992-01-01,1992.0
67,203,January - M011992,0,Hyderabad,Andhra Pradesh Pollution Control Board,1,35.8,12.5,108.832784,261.00000,,40.791467,1992-01-01,1992.0
68,232,January - M011992,0,Vishakhapatnam,Andhra Pradesh Pollution Control Board,1,52.6,89.6,108.832784,679.00000,,40.791467,1992-01-01,1992.0


In [19]:
# Select the Andhra Pradesh rows. The integer code is looked up through the
# fitted `labelencoder` instead of hard-coding 0, so the filter stays correct
# if the set of state names (and therefore the code assignment) changes.
andhra_code = labelencoder.transform(['Andhra Pradesh'])[0]
# .copy() gives an independent frame, so later column assignments on
# `dfAndhra` do not trigger chained-assignment warnings against `df`.
dfAndhra = df[df['state'] == andhra_code].copy()

In [20]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `location` for the Andhra Pradesh subset. drop='first'
# removes the first category, so k categories produce k-1 indicator columns.
# The dense-output switch was renamed: scikit-learn 1.2 introduced
# `sparse_output` and 1.4 removed the old `sparse` keyword. Try the new name
# first and fall back to the old one so the cell runs on either side.
try:
    onehotencoder = OneHotEncoder(sparse_output=False, handle_unknown='error', drop='first')
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    onehotencoder = OneHotEncoder(sparse=False, handle_unknown='error', drop='first')

location_onehot = onehotencoder.fit_transform(dfAndhra[["location"]])
pd.DataFrame(location_onehot)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Categories learned by the encoder (one array per encoded column). Because
# of drop='first', the first entry has no indicator column of its own.
# NOTE(review): 'Visakhapatnam' and 'Vishakhapatnam' appear as separate
# categories - presumably the same city spelled two ways; confirm and merge.
onehotencoder.categories_

[array(['Ananthapur', 'Chittoor', 'Eluru', 'Guntur', 'Hyderabad', 'Kadapa',
        'Kakinada', 'Karimnagar', 'Khammam', 'Kurnool', 'Nalgonda',
        'Nellore', 'Nizamabad', 'Ongole', 'Patancheru', 'Rajahmundry',
        'Ramagundam', 'Sangareddy', 'Srikakulam', 'Tirupati', 'Vijayawada',
        'Visakhapatnam', 'Vishakhapatnam', 'Vizianagaram', 'Warangal'],
       dtype=object)]

In [22]:
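# Dummy-coding illustration (presumably for drop='first' with three categories):
# category 1 is the dropped baseline and is encoded as all zeros.
# 1 -> 0 0
# 2 -> 1 0
# 3 -> 0 1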