# NumPy Problems

#### 1. Basic Array Operations
Convert the mpg column into a NumPy array and calculate:
* The mean, median, and standard deviation of mpg.
* The number of cars with mpg greater than 25.

In [143]:
import pandas as pd
import numpy as np

In [144]:
df = pd.read_csv("C:/Users/Acer/Downloads/auto-mpg.csv")
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [145]:
array_mpg = np.array(df["mpg"])
array_mpg

array([18. , 15. , 18. , 16. , 17. , 15. , 14. , 14. , 14. , 15. , 15. ,
       14. , 15. , 14. , 24. , 22. , 18. , 21. , 27. , 26. , 25. , 24. ,
       25. , 26. , 21. , 10. , 10. , 11. ,  9. , 27. , 28. , 25. , 25. ,
       19. , 16. , 17. , 19. , 18. , 14. , 14. , 14. , 14. , 12. , 13. ,
       13. , 18. , 22. , 19. , 18. , 23. , 28. , 30. , 30. , 31. , 35. ,
       27. , 26. , 24. , 25. , 23. , 20. , 21. , 13. , 14. , 15. , 14. ,
       17. , 11. , 13. , 12. , 13. , 19. , 15. , 13. , 13. , 14. , 18. ,
       22. , 21. , 26. , 22. , 28. , 23. , 28. , 27. , 13. , 14. , 13. ,
       14. , 15. , 12. , 13. , 13. , 14. , 13. , 12. , 13. , 18. , 16. ,
       18. , 18. , 23. , 26. , 11. , 12. , 13. , 12. , 18. , 20. , 21. ,
       22. , 18. , 19. , 21. , 26. , 15. , 16. , 29. , 24. , 20. , 19. ,
       15. , 24. , 20. , 11. , 20. , 21. , 19. , 15. , 31. , 26. , 32. ,
       25. , 16. , 16. , 18. , 16. , 13. , 14. , 14. , 14. , 29. , 26. ,
       26. , 31. , 32. , 28. , 24. , 26. , 24. , 26

In [146]:
mean_mpg = np.mean(array_mpg)
mean_mpg

23.514572864321607

In [147]:
median_mpg = np.median(array_mpg)
median_mpg

23.0

In [148]:
std_mpg = np.std(array_mpg)
std_mpg

7.806159061274433

In [149]:
new_cars = df["mpg"]>25                     #---------------> 158 cars with mpg>25.
new_cars.value_counts()

False    240
True     158
Name: mpg, dtype: int64

In [150]:
count = len([i for i in array_mpg if i>25])
print(count)

158


### 2. Filtering
* Using NumPy, filter all cars with more than 6 cylinders.
* Return the corresponding car_name as a list.

In [151]:
df_1 = df[df["cylinders"]>6]
df_1.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [152]:
car_name=df_1["car name"].tolist()
car_name

['chevrolet chevelle malibu',
 'buick skylark 320',
 'plymouth satellite',
 'amc rebel sst',
 'ford torino',
 'ford galaxie 500',
 'chevrolet impala',
 'plymouth fury iii',
 'pontiac catalina',
 'amc ambassador dpl',
 'dodge challenger se',
 "plymouth 'cuda 340",
 'chevrolet monte carlo',
 'buick estate wagon (sw)',
 'ford f250',
 'chevy c20',
 'dodge d200',
 'hi 1200d',
 'chevrolet impala',
 'pontiac catalina brougham',
 'ford galaxie 500',
 'plymouth fury iii',
 'dodge monaco (sw)',
 'ford country squire (sw)',
 'pontiac safari (sw)',
 'chevrolet impala',
 'pontiac catalina',
 'plymouth fury iii',
 'ford galaxie 500',
 'amc ambassador sst',
 'mercury marquis',
 'buick lesabre custom',
 'oldsmobile delta 88 royale',
 'chrysler newport royal',
 'amc matador (sw)',
 'chevrolet chevelle concours (sw)',
 'ford gran torino (sw)',
 'plymouth satellite custom (sw)',
 'buick century 350',
 'amc matador',
 'chevrolet malibu',
 'ford gran torino',
 'dodge coronet custom',
 'mercury marquis brou

### 3. Statistical Analysis
* Compute the 25th, 50th, and 75th percentiles of the weight column using NumPy

In [153]:
print("25th percentile is", np.percentile(df["weight"],25))

25th percentile is 2223.75


In [154]:
print("50th percentile is", np.percentile(df["weight"],50))

50th percentile is 2803.5


In [155]:
print("75th percentile is", np.percentile(df["weight"],75))

75th percentile is 3608.0


### 4. Array Manipulation
* Convert the acceleration column into a NumPy array and normalize its values
(scale between 0 and 1).

In [156]:
import numpy as np
array_acceleration = np.array(df["acceleration"])  #--->NumPy array creation
normalizedData = (array_acceleration-np.min(array_acceleration))/(np.max(array_acceleration)-np.min(array_acceleration)) 
print(normalizedData)                          #---------># normalized data using min max value 

[0.23809524 0.20833333 0.17857143 0.23809524 0.14880952 0.11904762
 0.05952381 0.0297619  0.11904762 0.0297619  0.11904762 0.
 0.08928571 0.11904762 0.41666667 0.44642857 0.44642857 0.47619048
 0.38690476 0.74404762 0.56547619 0.38690476 0.56547619 0.26785714
 0.41666667 0.35714286 0.41666667 0.32738095 0.625      0.38690476
 0.44642857 0.35714286 0.6547619  0.29761905 0.44642857 0.44642857
 0.44642857 0.44642857 0.23809524 0.20833333 0.32738095 0.29761905
 0.20833333 0.23809524 0.23809524 0.32738095 0.6547619  0.41666667
 0.38690476 0.35714286 0.35714286 0.68452381 0.38690476 0.6547619
 0.5952381  0.6547619  0.74404762 0.44642857 0.53571429 0.92261905
 0.68452381 0.50595238 0.23809524 0.23809524 0.32738095 0.29761905
 0.20833333 0.17857143 0.32738095 0.32738095 0.26785714 0.32738095
 0.26785714 0.35714286 0.47619048 0.35714286 0.38690476 0.5952381
 0.68452381 0.5952381  0.47619048 0.53571429 0.38690476 0.41666667
 0.50595238 0.29761905 0.20833333 0.29761905 0.38690476 0.26785714
 0.20

 ### 5. Broadcasting
Increase all horsepower values by 10% and store the updated values in a new
NumPy array. Handle missing data (if any) by replacing it with the mean of the
column before applying the increase.

  



In [157]:
df["horsepower"].replace('?', np.nan)
df["horsepower"]= pd.to_numeric(df['horsepower'], errors='coerce') # Convert 'horsepower' column to numeric, forcing errors to NaN (invalid values will become NaN)
horsepower_Mean= df["horsepower"].mean()
df["horsepower"].replace('np.nan', horsepower_Mean)  #replace missing value with mean value.
array_Horsepower= df["horsepower"].to_numpy()
array_new_Horsepower=array_Horsepower*1.10
array_new_Horsepower

array([143. , 181.5, 165. , 165. , 154. , 217.8, 242. , 236.5, 247.5,
       209. , 187. , 176. , 165. , 247.5, 104.5, 104.5, 106.7,  93.5,
        96.8,  50.6,  95.7,  99. , 104.5, 124.3,  99. , 236.5, 220. ,
       231. , 212.3,  96.8,  99. , 104.5,   nan, 110. , 115.5, 110. ,
        96.8, 110. , 181.5, 192.5, 168.3, 165. , 198. , 187. , 192.5,
       121. ,  79.2, 110. ,  96.8,  94.6,  99. ,  77. ,  83.6,  71.5,
        75.9,  66. ,  77. , 104.5,  88. ,  59.4,  99. ,  94.6, 181.5,
       192.5, 165. , 168.3, 165. , 228.8, 170.5, 176. , 209. , 106.7,
       165. , 143. , 154. , 165. , 123.2,  83.6,  95.7,  75.9,  94.6,
       101.2, 106.7,  88. ,  96.8, 192.5, 165. , 159.5, 150.7, 165. ,
       217.8, 165. , 173.8, 165. , 236.5, 247.5, 192.5, 115.5, 110. ,
       110. ,  96.8, 104.5,  50.6, 165. , 183.7, 187. , 198. , 110. ,
        96.8,  79.2, 103.4,  99. ,  93.5, 117.7,  99. , 159.5, 253. ,
        53.9,  82.5, 100.1, 123.2, 165. , 121. , 134.2, 198. , 104.5,
         nan, 110. ,

### 6. Boolean Indexing
Find the average displacement of cars with an origin of 2 (Europe) using NumPy
indexing.

In [158]:
df_2 = df[df["origin"] ==2 ]
df_2.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
19,26.0,4,97.0,46.0,1835,20.5,70,2,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87.0,2672,17.5,70,2,peugeot 504
21,24.0,4,107.0,90.0,2430,14.5,70,2,audi 100 ls


In [159]:
displacement_mean = np.mean(df_2["displacement"])
displacement_mean

109.14285714285714

### 7.Matrix Operations
* Create a 2D NumPy array containing the columns mpg, horsepower, and weight.
* Compute the dot product of this matrix with a given vector [1, 0.5, -0.2].

In [160]:
import numpy as np
array_2D = np.array([df["mpg"], df["horsepower"], df["weight"]]).T  # Transpose to get proper shape
#array_2D = df[["mpg","horsepower","weight"]].to_numpy()    #Extracting columns as numpy array.
b = [1, 0.5, -0.2]
result = np.dot(array_2D, b)         # Dot product(array_2D ,b)
print(result)

[-617.8 -641.1 -594.2 -595.6 -602.8 -754.2 -746.8 -740.9 -758.5 -660.
 -612.6 -627.8 -662.2 -490.7 -402.9 -497.1 -488.3 -453.9 -355.  -318.
 -465.9 -417.  -402.5 -364.3 -463.6 -805.5 -765.2 -760.4 -840.9 -355.
 -379.8 -373.1    nan -457.8 -619.3 -598.8 -597.4 -589.6 -745.3 -791.3
 -740.3 -730.2 -889.  -851.2 -927.5 -519.4 -423.6 -587.4 -565.8 -378.
 -351.6 -349.8 -345.  -291.1 -253.1 -309.8 -330.  -384.1 -360.2 -400.8
 -416.6 -381.2 -759.3 -775.5 -737.  -735.3 -642.4 -811.6 -809.9 -799.2
 -776.4 -398.5 -688.4 -741.6 -775.8 -726.4 -512.6 -442.2 -531.3 -377.3
 -414.  -383.6 -429.7 -364.8 -349.  -719.5 -645.4 -712.1 -725.9 -665.4
 -879.4 -804.8 -780.6 -758.4 -826.5 -865.7 -663.7 -553.7 -589.6 -521.
 -542.2 -510.3 -341.  -913.4 -885.7 -832.8 -797.8 -489.8 -391.8 -423.2
 -406.8 -361.8 -400.5 -419.9 -382.  -728.9 -724.6 -319.9 -370.1 -450.9
 -498.6 -589.8 -453.  -480.4 -631.8 -552.9    nan -511.2 -602.2 -325.5
 -424.2 -302.7 -445.9 -690.2 -655.4 -652.1 -742.2 -851.8 -802.4 -843.6
 -762.4 -37

In [161]:
import numpy as np
array_2D = np.array([df["mpg"],df["horsepower"],df["weight"]])
b = [1,0.5,-0.2]
array_2D
print (np.dot(b,array_2D))                    # Dot product(array_2D ,b)

[-617.8 -641.1 -594.2 -595.6 -602.8 -754.2 -746.8 -740.9 -758.5 -660.
 -612.6 -627.8 -662.2 -490.7 -402.9 -497.1 -488.3 -453.9 -355.  -318.
 -465.9 -417.  -402.5 -364.3 -463.6 -805.5 -765.2 -760.4 -840.9 -355.
 -379.8 -373.1    nan -457.8 -619.3 -598.8 -597.4 -589.6 -745.3 -791.3
 -740.3 -730.2 -889.  -851.2 -927.5 -519.4 -423.6 -587.4 -565.8 -378.
 -351.6 -349.8 -345.  -291.1 -253.1 -309.8 -330.  -384.1 -360.2 -400.8
 -416.6 -381.2 -759.3 -775.5 -737.  -735.3 -642.4 -811.6 -809.9 -799.2
 -776.4 -398.5 -688.4 -741.6 -775.8 -726.4 -512.6 -442.2 -531.3 -377.3
 -414.  -383.6 -429.7 -364.8 -349.  -719.5 -645.4 -712.1 -725.9 -665.4
 -879.4 -804.8 -780.6 -758.4 -826.5 -865.7 -663.7 -553.7 -589.6 -521.
 -542.2 -510.3 -341.  -913.4 -885.7 -832.8 -797.8 -489.8 -391.8 -423.2
 -406.8 -361.8 -400.5 -419.9 -382.  -728.9 -724.6 -319.9 -370.1 -450.9
 -498.6 -589.8 -453.  -480.4 -631.8 -552.9    nan -511.2 -602.2 -325.5
 -424.2 -302.7 -445.9 -690.2 -655.4 -652.1 -742.2 -851.8 -802.4 -843.6
 -762.4 -37

### 8. Sorting
* Use NumPy to sort the cars by model_year in descending order and display the first five car names.

In [162]:
df_3 = df.sort_values(by="model year",ascending=False)
df_3.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
397,31.0,4,119.0,82.0,2720,19.4,82,1,chevy s-10
382,34.0,4,108.0,70.0,2245,16.9,82,3,toyota corolla
367,28.0,4,112.0,88.0,2605,19.6,82,1,chevrolet cavalier
368,27.0,4,112.0,88.0,2640,18.6,82,1,chevrolet cavalier wagon
369,34.0,4,112.0,88.0,2395,18.0,82,1,chevrolet cavalier 2-door


### 9. Correlation
* Compute the Pearson correlation coefcient between mpg and weight using NumPy

In [163]:
import numpy as np

mpg = np.array(df["mpg"])
weight = np.array(df["weight"])

correlation_matrix = np.corrcoef(mpg, weight)
pearson_correlation = correlation_matrix[0, 1]

print("Pearson correlation coefficient between mpg and weight:", pearson_correlation)

Pearson correlation coefficient between mpg and weight: -0.8317409332443352


### 10.Conditional Aggregates
* Calculate the mean mpg for cars grouped by the number of cylinders using NumPy techniques.

In [164]:
df_4 = df.groupby(["cylinders"])
df_4.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
14,24.0,4,113.0,95.0,2372,15.0,70,3,toyota corona mark ii
15,22.0,6,198.0,95.0,2833,15.5,70,1,plymouth duster
71,19.0,3,70.0,97.0,2330,13.5,72,3,mazda rx2 coupe
274,20.3,5,131.0,103.0,2830,15.9,78,2,audi 5000


In [165]:
print(df_4["mpg"].mean())

cylinders
3    20.550000
4    29.286765
5    27.366667
6    19.985714
8    14.963107
Name: mpg, dtype: float64


# Pandas Problems

### 1.Basic Exploration.
* The first 10 rows
* The total number of rows and columns
* Summary statistics for numerical columns

In [166]:
df = pd.read_csv("C:/Users/Acer/Downloads/auto-mpg.csv")
df.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [167]:
df.shape

(398, 9)

In [168]:
df.describe

<bound method NDFrame.describe of       mpg  cylinders  displacement horsepower  weight  acceleration  \
0    18.0          8         307.0        130    3504          12.0   
1    15.0          8         350.0        165    3693          11.5   
2    18.0          8         318.0        150    3436          11.0   
3    16.0          8         304.0        150    3433          12.0   
4    17.0          8         302.0        140    3449          10.5   
..    ...        ...           ...        ...     ...           ...   
393  27.0          4         140.0         86    2790          15.6   
394  44.0          4          97.0         52    2130          24.6   
395  32.0          4         135.0         84    2295          11.6   
396  28.0          4         120.0         79    2625          18.6   
397  31.0          4         119.0         82    2720          19.4   

     model year  origin                   car name  
0            70       1  chevrolet chevelle malibu  
1      

### 2.Filtering and Indexing.
* Find all cars manufactured in 1975 with a weight less than 3000. Return the DataFrame with selected columns: car_name, weight, and mpg.

In [169]:
filtered_data = df[(df["model year"]== 75) & (df["weight"]<3000)]
filtered_data[["car name","weight","mpg"]]

Unnamed: 0,car name,weight,mpg
167,toyota corolla,2171,29.0
168,ford pinto,2639,23.0
169,amc gremlin,2914,20.0
170,pontiac astro,2592,23.0
171,toyota corona,2702,24.0
172,volkswagen dasher,2223,25.0
173,datsun 710,2545,24.0
174,ford pinto,2984,18.0
175,volkswagen rabbit,1937,29.0
177,audi 100ls,2694,23.0


### 3.Handling Missing Data
* Identify if there are any missing values in the dataset. 
* Replace missing values in the
horsepower column with the column's median.

In [170]:
df.isna().sum()           #isentifying Missing values in the dataset.

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [171]:
# There is a "?" in the horse power column which is also treating missing values.

In [172]:
df.loc[32]

mpg                   25.0
cylinders                4
displacement          98.0
horsepower               ?
weight                2046
acceleration          19.0
model year              71
origin                   1
car name        ford pinto
Name: 32, dtype: object

In [173]:
#Replace missing values in the horsepower column with the column's median.

df["horsepower"]= pd.to_numeric(df['horsepower'], errors='coerce') # Convert 'horsepower' column to numeric, forcing errors to NaN (invalid values will become NaN)
df["horsepower"] = df["horsepower"].replace(np.nan,df["horsepower"].median())  #replace missing value with mean value.
print(df.loc[32])

mpg                   25.0
cylinders                4
displacement          98.0
horsepower            93.5
weight                2046
acceleration          19.0
model year              71
origin                   1
car name        ford pinto
Name: 32, dtype: object


### 4.Data Transformation
* Add a new column power_to_weight_ratio, calculated as horsepower / weight.

In [174]:
df["power_to_weight_ratio"] = df["horsepower"]/df["weight"]
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,power_to_weight_ratio
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,0.0371
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,0.044679
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,0.043655
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,0.043694
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,0.040591


### 5.Group By
* Group the cars by origin and calculate the mean mpg for each group.

In [175]:
df_5 = df.groupby("origin")
df_5["mpg"].mean()

origin
1    20.083534
2    27.891429
3    30.450633
Name: mpg, dtype: float64

### 6.Sorting
* Sort the DataFrame by mpg in descending order and display the top 10 cars with the highest mpg.

In [176]:
df_sorted = df.sort_values(by = "mpg",ascending = False)
df_sorted[["mpg","car name"]].head(10)

Unnamed: 0,mpg,car name
322,46.6,mazda glc
329,44.6,honda civic 1500 gl
325,44.3,vw rabbit c (diesel)
394,44.0,vw pickup
326,43.4,vw dasher (diesel)
244,43.1,volkswagen rabbit custom diesel
309,41.5,vw rabbit
330,40.9,renault lecar deluxe
324,40.8,datsun 210
247,39.4,datsun b210 gx


### 7. Apply this function to each row and store the result in the new column.
### Apply Function
* Create a new column performance_score using a custom function:
- def performance_score(row):
- return row['mpg'] * row['acceleration'] / row['weight']

In [177]:
def performance_score(row):
    return row['mpg'] *row['acceleration'] / row['weight']

df['performance_score'] = df.apply(performance_score, axis=1)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,power_to_weight_ratio,performance_score
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,0.037100,0.061644
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,0.044679,0.046710
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,0.043655,0.057625
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,0.043694,0.055928
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,0.040591,0.051754
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl,0.030824,0.150968
394,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup,0.024413,0.508169
395,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage,0.036601,0.161743
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger,0.030095,0.198400


### 8. Visualization Preparation
* Generate a summary DataFrame with:
* Average mpg, weight, and horsepower for each model_year.

In [178]:
# Group by 'model_year' and calculate the mean of mpg, weight, and horsepower

summary_df = df.groupby('model year')[['mpg', 'weight', 'horsepower']].mean()
print(summary_df)

                  mpg       weight  horsepower
model year                                    
70          17.689655  3372.793103  147.827586
71          21.250000  2995.428571  106.553571
72          18.714286  3237.714286  120.178571
73          17.100000  3419.025000  130.475000
74          22.703704  2877.925926   94.203704
75          20.266667  3176.800000  101.066667
76          21.573529  3078.735294  101.117647
77          23.375000  2997.357143  105.071429
78          24.061111  2861.805556   99.694444
79          25.093103  3055.344828  101.206897
80          33.696552  2436.655172   78.586207
81          30.334483  2522.931034   81.465517
82          31.709677  2453.548387   81.854839


### 9. Exporting Data
* Save a subset of the data containing only mpg, cylinders, horsepower, and weight
for cars with mpg > 30 into a CSV file named high_mpg_cars.csv.

In [179]:
high_mpg_cars = df[df["mpg"]>30]
high_mpg_cars.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,power_to_weight_ratio,performance_score
53,31.0,4,71.0,65.0,1773,19.0,71,3,toyota corolla 1200,0.036661,0.332205
54,35.0,4,72.0,69.0,1613,18.0,71,3,datsun 1200,0.042777,0.390577
129,31.0,4,79.0,67.0,1950,19.0,74,3,datsun b210,0.034359,0.302051


In [180]:
high_mpg_cars_subset = high_mpg_cars [["mpg","cylinders","horsepower","weight"]]
high_mpg_cars_subset.head(3)

Unnamed: 0,mpg,cylinders,horsepower,weight
53,31.0,4,65.0,1773
54,35.0,4,69.0,1613
129,31.0,4,67.0,1950


In [181]:
high_mpg_cars_subset.to_csv("high_mpg_cars.csv",index = False)

### 10. Finding Anomalies
* Identify potential outliers in the mpg column using the Interquartile Range (IQR)
method. Specifically:
* Calculate the IQR for mpg.
* Define outliers as values less than Q1 - 1.5 * IQR or greater than Q3 + 1.5 * IQR.
* Create a DataFrame of cars classified as outliers, displaying car_name, mpg, and model_year.

In [182]:
Q1 = df['mpg'].quantile(0.25)
Q3 = df['mpg'].quantile(0.75)
IQR = Q3 - Q1
lower_Bound = Q1 - 1.5 * IQR
upper_Bound = Q3 + 1.5 * IQR
outliers = df[(df['mpg'] < lower_Bound) | (df['mpg'] > upper_Bound)]
outlier_Cars = outliers[['car name', 'mpg', 'model year']]
print(outlier_Cars)

      car name   mpg  model year
322  mazda glc  46.6          80
