In [4]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('auto-mpg (3).csv')

In [6]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [7]:
mpg = data['mpg'].to_numpy()  # mpg column into numpy array

### basic array operation

In [8]:
np.mean(mpg)


np.float64(23.514572864321607)

In [9]:
np.median(mpg)

np.float64(23.0)

In [10]:
np.std(mpg)

np.float64(7.806159061274433)

In [11]:
np.sum(mpg > 25)

np.int64(158)

### filtering

In [12]:
print(data.columns)

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')


In [13]:
filtered = data[data['cylinders'] > 6]
car_names = filtered['car name'].tolist()
print(car_names)


['chevrolet chevelle malibu', 'buick skylark 320', 'plymouth satellite', 'amc rebel sst', 'ford torino', 'ford galaxie 500', 'chevrolet impala', 'plymouth fury iii', 'pontiac catalina', 'amc ambassador dpl', 'dodge challenger se', "plymouth 'cuda 340", 'chevrolet monte carlo', 'buick estate wagon (sw)', 'ford f250', 'chevy c20', 'dodge d200', 'hi 1200d', 'chevrolet impala', 'pontiac catalina brougham', 'ford galaxie 500', 'plymouth fury iii', 'dodge monaco (sw)', 'ford country squire (sw)', 'pontiac safari (sw)', 'chevrolet impala', 'pontiac catalina', 'plymouth fury iii', 'ford galaxie 500', 'amc ambassador sst', 'mercury marquis', 'buick lesabre custom', 'oldsmobile delta 88 royale', 'chrysler newport royal', 'amc matador (sw)', 'chevrolet chevelle concours (sw)', 'ford gran torino (sw)', 'plymouth satellite custom (sw)', 'buick century 350', 'amc matador', 'chevrolet malibu', 'ford gran torino', 'dodge coronet custom', 'mercury marquis brougham', 'chevrolet caprice classic', 'ford l

### Statistical Analysis

In [14]:
weight = data['weight'].to_numpy()
percentiles = np.percentile(weight, [25, 50, 75])
print(percentiles)

[2223.75 2803.5  3608.  ]


### Array manipulation

In [15]:
acc = data['acceleration'].to_numpy()
normalized_acc = (acc - np.min(acc)) / (np.max(acc) - np.min(acc))
print(normalized_acc)


[0.23809524 0.20833333 0.17857143 0.23809524 0.14880952 0.11904762
 0.05952381 0.0297619  0.11904762 0.0297619  0.11904762 0.
 0.08928571 0.11904762 0.41666667 0.44642857 0.44642857 0.47619048
 0.38690476 0.74404762 0.56547619 0.38690476 0.56547619 0.26785714
 0.41666667 0.35714286 0.41666667 0.32738095 0.625      0.38690476
 0.44642857 0.35714286 0.6547619  0.29761905 0.44642857 0.44642857
 0.44642857 0.44642857 0.23809524 0.20833333 0.32738095 0.29761905
 0.20833333 0.23809524 0.23809524 0.32738095 0.6547619  0.41666667
 0.38690476 0.35714286 0.35714286 0.68452381 0.38690476 0.6547619
 0.5952381  0.6547619  0.74404762 0.44642857 0.53571429 0.92261905
 0.68452381 0.50595238 0.23809524 0.23809524 0.32738095 0.29761905
 0.20833333 0.17857143 0.32738095 0.32738095 0.26785714 0.32738095
 0.26785714 0.35714286 0.47619048 0.35714286 0.38690476 0.5952381
 0.68452381 0.5952381  0.47619048 0.53571429 0.38690476 0.41666667
 0.50595238 0.29761905 0.20833333 0.29761905 0.38690476 0.26785714
 0.20

### sorting

In [16]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [17]:
sorted_data = data.sort_values(by='model year', ascending=False)['car name'][0:6]
print(sorted_data)

397          chevy s-10
396         ford ranger
395       dodge rampage
394           vw pickup
393     ford mustang gl
392    chevrolet camaro
Name: car name, dtype: object


### broadcasting

In [18]:
horsepower = data['horsepower'].to_numpy()
updated_hp = horsepower * 10
print(updated_hp)


['130130130130130130130130130130' '165165165165165165165165165165'
 '150150150150150150150150150150' '150150150150150150150150150150'
 '140140140140140140140140140140' '198198198198198198198198198198'
 '220220220220220220220220220220' '215215215215215215215215215215'
 '225225225225225225225225225225' '190190190190190190190190190190'
 '170170170170170170170170170170' '160160160160160160160160160160'
 '150150150150150150150150150150' '225225225225225225225225225225'
 '95959595959595959595' '95959595959595959595' '97979797979797979797'
 '85858585858585858585' '88888888888888888888' '46464646464646464646'
 '87878787878787878787' '90909090909090909090' '95959595959595959595'
 '113113113113113113113113113113' '90909090909090909090'
 '215215215215215215215215215215' '200200200200200200200200200200'
 '210210210210210210210210210210' '193193193193193193193193193193'
 '88888888888888888888' '90909090909090909090' '95959595959595959595'
 '??????????' '100100100100100100100100100100'
 '10510510510

### Correlation

In [19]:
corr = np.corrcoef(data['mpg'], data['weight'])
print(corr)

[[ 1.         -0.83174093]
 [-0.83174093  1.        ]]


### Conditional Aggregates

In [20]:
mean_mpg_cyn = data.groupby('cylinders')['mpg'].mean()
print(mean_mpg_cyn)

cylinders
3    20.550000
4    29.286765
5    27.366667
6    19.985714
8    14.963107
Name: mpg, dtype: float64


In [21]:
mean_mpg_cyn = data.groupby('cylinders')['mpg'].mean()
mean_mpg_cyn.mean()

np.float64(22.430450490875963)

# pandas

In [22]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [23]:
data.head(10)    # The first 10 rows

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [24]:
data.shape      # The total number of rows and columns

(398, 9)

In [25]:
data.describe()    # summary statistics for numerical columns

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [26]:
# Filtering and Indexing
filtered_data = data.loc[(data['model year'] == 75)&(data['weight']) < 3000,['car name','weight','mpg']]
print(filtered_data)



                      car name  weight   mpg
0    chevrolet chevelle malibu    3504  18.0
1            buick skylark 320    3693  15.0
2           plymouth satellite    3436  18.0
3                amc rebel sst    3433  16.0
4                  ford torino    3449  17.0
..                         ...     ...   ...
393            ford mustang gl    2790  27.0
394                  vw pickup    2130  44.0
395              dodge rampage    2295  32.0
396                ford ranger    2625  28.0
397                 chevy s-10    2720  31.0

[398 rows x 3 columns]


In [27]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [28]:
# handling Missing Data
data.isna().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,0
weight,0
acceleration,0
model year,0
origin,0
car name,0


In [29]:
# Group By
g = data.groupby("origin")

In [30]:
g["mpg"].mean()

Unnamed: 0_level_0,mpg
origin,Unnamed: 1_level_1
1,20.083534
2,27.891429
3,30.450633


In [31]:
# sorting

data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [32]:
data.sort_values(by='mpg',ascending=False)[['car name','mpg']].head(10)

Unnamed: 0,car name,mpg
322,mazda glc,46.6
329,honda civic 1500 gl,44.6
325,vw rabbit c (diesel),44.3
394,vw pickup,44.0
326,vw dasher (diesel),43.4
244,volkswagen rabbit custom diesel,43.1
309,vw rabbit,41.5
330,renault lecar deluxe,40.9
324,datsun 210,40.8
247,datsun b210 gx,39.4


In [34]:
# apply fuction

def performance_score(row):
    return row['mpg'] * row['acceleration'] / row['weight']


In [36]:
data['performance_score'] = data.apply(performance_score, axis=1)
print(data[['car name', 'mpg', 'acceleration', 'weight', 'performance_score']].head())

                    car name   mpg  acceleration  weight  performance_score
0  chevrolet chevelle malibu  18.0          12.0    3504           0.061644
1          buick skylark 320  15.0          11.5    3693           0.046710
2         plymouth satellite  18.0          11.0    3436           0.057625
3              amc rebel sst  16.0          12.0    3433           0.055928
4                ford torino  17.0          10.5    3449           0.051754


In [48]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,performance_score
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,0.061644
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,0.046710
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,0.057625
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,0.055928
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,0.051754
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl,0.150968
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup,0.508169
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage,0.161743
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger,0.198400


In [74]:
# exporting data
high_mpg_cars = data[data['mpg'] > 30][['mpg', 'cylinders', 'horsepower', 'weight']]
high_mpg_cars.to_csv('high_mpg_cars.csv')
print(high_mpg_cars)

      mpg  cylinders horsepower  weight
53   31.0          4         65    1773
54   35.0          4         69    1613
129  31.0          4         67    1950
131  32.0          4         65    1836
144  31.0          4         52    1649
..    ...        ...        ...     ...
390  32.0          4         96    2665
391  36.0          4         84    2370
394  44.0          4         52    2130
395  32.0          4         84    2295
397  31.0          4         82    2720

[85 rows x 4 columns]


In [66]:
#finding anomalies
Q1 = data['mpg'].quantile(0.25)
Q3 = data['mpg'].quantile(0.75)
IQR = Q3 - Q1

In [69]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [71]:
outliers = data[(data['mpg'] < lower_bound) | (data['mpg'] > upper_bound)]
print(outliers[['car name', 'mpg', 'model year']])

      car name   mpg  model year
322  mazda glc  46.6          80
