In [102]:
import pandas as pd 
import numpy as np
import seaborn as sns

In [103]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [104]:
df = sns.load_dataset("MPG")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [106]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [107]:
df.shape

(398, 9)

In [108]:
df.sample(20)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
43,13.0,8,400.0,170.0,4746,12.0,71,usa,ford country squire (sw)
188,16.0,8,318.0,150.0,4190,13.0,76,usa,dodge coronet brougham
33,19.0,6,232.0,100.0,2634,13.0,71,usa,amc gremlin
156,16.0,8,400.0,170.0,4668,11.5,75,usa,pontiac catalina
215,13.0,8,318.0,150.0,3755,14.0,76,usa,dodge d100
278,31.5,4,89.0,71.0,1990,14.9,78,europe,volkswagen scirocco
164,21.0,6,231.0,110.0,3039,15.0,75,usa,buick skyhawk
223,15.5,8,318.0,145.0,4140,13.7,77,usa,dodge monaco brougham
331,33.8,4,97.0,67.0,2145,18.0,80,japan,subaru dl
10,15.0,8,383.0,170.0,3563,10.0,70,usa,dodge challenger se


In [109]:
df['name'].nunique()

305

In [110]:
df['name']

0      chevrolet chevelle malibu
1              buick skylark 320
2             plymouth satellite
3                  amc rebel sst
4                    ford torino
                 ...            
393              ford mustang gl
394                    vw pickup
395                dodge rampage
396                  ford ranger
397                   chevy s-10
Name: name, Length: 398, dtype: object

In [111]:
df.drop(columns = ['name'],inplace = True)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [112]:
df['origin'].nunique()

3

In [113]:
x = df.iloc[:,1:]
x


Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,usa
1,8,350.0,165.0,3693,11.5,70,usa
2,8,318.0,150.0,3436,11.0,70,usa
3,8,304.0,150.0,3433,12.0,70,usa
4,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,usa
394,4,97.0,52.0,2130,24.6,82,europe
395,4,135.0,84.0,2295,11.6,82,usa
396,4,120.0,79.0,2625,18.6,82,usa


In [114]:
y = df['mpg']
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [115]:
from sklearn.model_selection import train_test_split

In [116]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [117]:
x_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
204,4,85.0,70.0,1990,17.0,76,japan
186,4,101.0,83.0,2202,15.3,76,europe
3,8,304.0,150.0,3433,12.0,70,usa
390,4,144.0,96.0,2665,13.9,82,japan
193,6,200.0,81.0,3012,17.6,76,usa
...,...,...,...,...,...,...,...
299,4,141.0,71.0,3190,24.8,79,europe
22,4,104.0,95.0,2375,17.5,70,europe
72,8,304.0,150.0,3892,12.5,72,usa
15,6,198.0,95.0,2833,15.5,70,usa


In [118]:
x_test

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
94,8,440.0,215.0,4735,11.0,73,usa
32,4,98.0,,2046,19.0,71,usa
279,4,98.0,68.0,2135,16.6,78,japan
178,4,120.0,88.0,2957,17.0,75,europe
354,4,100.0,,2320,15.8,81,europe
...,...,...,...,...,...,...,...
298,8,350.0,125.0,3900,17.4,79,usa
222,8,260.0,110.0,4060,19.0,77,usa
65,8,351.0,153.0,4129,13.0,72,usa
261,6,258.0,120.0,3410,15.1,78,usa


In [119]:
y_train

204    32.0
186    27.0
3      16.0
390    32.0
193    24.0
       ... 
299    27.2
22     25.0
72     15.0
15     22.0
168    23.0
Name: mpg, Length: 318, dtype: float64

In [120]:
y_test

94     13.0
32     25.0
279    29.5
178    23.0
354    34.5
       ... 
298    23.0
222    17.0
65     14.0
261    18.1
342    30.0
Name: mpg, Length: 80, dtype: float64

In [121]:
from sklearn.preprocessing import StandardScaler

In [122]:
tr = ColumnTransformer(transformers = [
    ('tn1',SimpleImputer(),["horsepower"]),
    ('std',StandardScaler(),['weight']),
    ('t2',OneHotEncoder(),['origin'])
],remainder='passthrough')

In [123]:
tr

ColumnTransformer(remainder='passthrough',
                  transformers=[('tn1', SimpleImputer(), ['horsepower']),
                                ('std', StandardScaler(), ['weight']),
                                ('t2', OneHotEncoder(), ['origin'])])

In [130]:
a = tr.fit_transform(x_train)

In [131]:
b = tr.fit_transform(x_test)

In [132]:
tr.fit_transform(x_train).shape

(318, 9)

In [127]:
from sklearn.linear_model import LinearRegression

In [128]:
lr =  LinearRegression()

In [129]:
lr.fit(a,y_train)

LinearRegression()

In [133]:
lr.predict(b)

array([10.97915314, 22.21753333, 30.87592665, 23.89995097, 31.33788521,
        7.20427382, 10.7534423 , 15.48157739, 14.23105871, 20.87079776,
       25.15140726, 24.73372025, 30.93728667, 21.38105567, 28.76197943,
       27.39150619, 21.2561761 , 30.5807089 , 35.306612  , 15.39741753,
       31.60669592, 21.5432606 , 25.71461035, 28.73419472, 11.86481782,
       20.30621563, 21.50975765, 20.82351913, 33.75875727, 20.76563085,
       27.98741355, 31.66443528, 19.99481821, 29.82381082, 11.19965846,
       30.83787998, 23.41385691, 27.01914355,  5.73085398, 32.3644768 ,
       18.94642601, 22.94180979, 21.5425674 , 25.88936262, 27.19756603,
       32.13606832, 17.50268231, 14.19047877, 33.98780036, 25.61517014,
       24.80930136, 23.05806333, 25.64128448, 27.31134136, 24.05859759,
       11.17085015, 27.91044311, 22.98307747, 16.17029013, 22.51061856,
       28.42902104, 29.56147147, 35.38260247, 24.69278269, 16.4722312 ,
       33.13702339, 20.57578974, 26.41731228, 11.28662687, 33.97

In [138]:
lr.score(a,y_train)

0.8105178787530587