### 1. Import Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv("Car_sales.csv")

Unnamed: 0,Manufacturer,Model,Sales in thousands,4-year resale value,Vehicle type,Price in thousands,Engine size,Horsepower,Wheelbase,Width,Length,Curb weight,Fuel capacity,Fuel efficiency,Latest Launch
0,Acura,Integra,16.919,16.36,Passenger,21.5,1.8,140,101.2,67.3,172.4,2.639,13.2,28,2-Feb-14
1,Acura,TL,39.384,19.875,Passenger,28.4,3.2,225,108.1,70.3,192.9,3.517,17.2,25,6-Mar-15
2,Acura,CL,14.114,18.225,Passenger,.,3.2,225,106.9,70.6,192,3.47,17.2,26,1-Apr-14
3,Acura,RL,8.588,29.725,Passenger,42,3.5,210,114.6,71.4,196.6,3.85,18,22,3-Oct-15
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150,102.6,68.2,178,2.998,16.4,27,10-Aug-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Volvo,V40,3.545,.,Passenger,24.4,1.9,160,100.5,67.6,176.6,3.042,15.8,25,21-Sep-15
153,Volvo,S70,15.245,.,Passenger,27.5,2.4,168,104.9,69.3,185.9,3.208,17.9,25,24-Nov-14
154,Volvo,V70,17.531,.,Passenger,28.8,2.4,168,104.9,69.3,186.2,3.259,17.9,25,25-Jun-15
155,Volvo,C70,3.493,.,Passenger,45.5,2.3,236,104.9,71.5,185.7,3.601,18.5,23,26-Apr-15


### 1. Read the Top Five Values

In [6]:
print(df.head())

    Manufacturer              Model  Sales in thousands 4-year resale value  \
0  Acura          Integra                        16.919               16.36   
1  Acura          TL                             39.384              19.875   
2  Acura          CL                             14.114              18.225   
3  Acura          RL                              8.588              29.725   
4  Audi           A4                             20.397              22.255   

  Vehicle type Price in thousands Engine size Horsepower Wheelbase Width  \
0    Passenger               21.5         1.8        140     101.2  67.3   
1    Passenger               28.4         3.2        225     108.1  70.3   
2    Passenger                  .         3.2        225     106.9  70.6   
3    Passenger                 42         3.5        210     114.6  71.4   
4    Passenger              23.99         1.8        150     102.6  68.2   

  Length Curb weight Fuel capacity Fuel efficiency Latest Launch  
0

### 2. Print DataFrame Info & Data Types of Each Column

In [7]:
print(df.info())
print(df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Manufacturer         157 non-null    object 
 1   Model                157 non-null    object 
 2   Sales in thousands   157 non-null    float64
 3   4-year resale value  157 non-null    object 
 4   Vehicle type         157 non-null    object 
 5   Price in thousands   157 non-null    object 
 6   Engine size          157 non-null    object 
 7   Horsepower           157 non-null    object 
 8   Wheelbase            157 non-null    object 
 9   Width                157 non-null    object 
 10  Length               157 non-null    object 
 11  Curb weight          157 non-null    object 
 12  Fuel capacity        157 non-null    object 
 13  Fuel efficiency      157 non-null    object 
 14  Latest Launch        157 non-null    object 
dtypes: float64(1), object(14)
memory usage: 

### 3. Print Number of Rows and Columns

In [8]:
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of rows: 157
Number of columns: 15


### 4. Drop Duplicate Rows (if any)

In [9]:
df.drop_duplicates(inplace=True)

### 5Print Number of Rows and Columns After Dropping Duplicates

In [10]:
print(f"After removing duplicates, rows: {df.shape[0]}, columns: {df.shape[1]}")

After removing duplicates, rows: 157, columns: 15


### 6. Print Summary Statistics for Numerical Variables

In [11]:
print(df.describe())

       Sales in thousands
count          157.000000
mean            52.998076
std             68.029422
min              0.110000
25%             14.114000
50%             29.450000
75%             67.956000
max            540.561000


### 7 Print Number of Missing Values in Each Column

In [12]:
print(df.isnull().sum())

Manufacturer           0
Model                  0
Sales in thousands     0
4-year resale value    0
Vehicle type           0
Price in thousands     0
Engine size            0
Horsepower             0
Wheelbase              0
Width                  0
Length                 0
Curb weight            0
Fuel capacity          0
Fuel efficiency        0
Latest Launch          0
dtype: int64


### 8  Drop the Column with Most Missing Values

In [13]:
df.drop(columns=[df.isnull().sum().idxmax()], inplace=True)

### 9 Drop Rows with Missing Categorical Values

In [14]:
df.dropna(subset=df.select_dtypes(include=['object']).columns, inplace=True)

 ### 10 Fill Missing Values in Numerical Columns with Mean

In [17]:
df[df.select_dtypes(include=[np.number]).columns] = df.select_dtypes(include=[np.number]).fillna(df.mean(numeric_only=True))

### 11. Sort Data with Respect to Price & Find the Most and Least Expensive Cars

In [18]:
df_sorted = df.sort_values(by="price", ascending=True)
print("Least Expensive Car:\n", df_sorted.head(1))
print("Most Expensive Car:\n", df_sorted.tail(1))

KeyError: 'price'

### 12. Function to Find Min & Max Values of Any Column

In [19]:
def find_min_max(col_name):
    return df[col_name].min(), df[col_name].max()

### 13. Call Function for Horsepower, Length, Fuel Efficiency

In [20]:
print("Horsepower Min, Max:", find_min_max("horsepower"))
print("Length Min, Max:", find_min_max("length"))
print("Fuel Efficiency Min, Max:", find_min_max("fuel_efficiency"))

KeyError: 'horsepower'

### 14. Plot Histogram of Continuous Numerical Variables

In [None]:
df[['price', 'sales', 'horsepower', 'fuel_efficiency']].hist(figsize=(10, 6), bins=20)
plt.show()

### 15. Probability Density Distribution of Length 

In [None]:
sns.kdeplot(df["length"], shade=True)
plt.show()

### 16. Count by Category – Group by Manufacturer

In [None]:
print(df.groupby("manufacturer").size())

### 17. Select All Numerical Variables

In [None]:
numerical_df = df.select_dtypes(include=[np.number])
print(numerical_df.head())

### 18. Print Correlation Coefficient Value of Price & Sales

In [None]:
correlation = df["price"].corr(df["sales"])
print("Correlation between price and sales:", correlation)

### 19. Plot Correlation of Price & Sales Using Scatterplot

In [None]:
sns.scatterplot(x=df["price"], y=df["sales"])
plt.show()

### 20. Pair Plot for Numerical Variables

In [None]:
sns.pairplot(numerical_df)
plt.show()

### 21. Boxplot of Sales by Different Manufacturers

In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(x=df["manufacturer"], y=df["sales"])
plt.xticks(rotation=45)
plt.show()

### 22. Boxplot of Other Numerical Variables w.r.t Manufacturer

In [None]:
num_vars = ["price", "horsepower", "fuel_efficiency", "length"]
for var in num_vars:
    plt.figure(figsize=(12,6))
    sns.boxplot(x=df["manufacturer"], y=df[var])
    plt.xticks(rotation=45)
    plt.show()

### 23. Divide Data into Input (X) & Output (Y = Sales in Thousands)

In [None]:
y = df["sales"]
X = df.drop(columns=["sales"])

### 24. Encode Categorical Variables Using Label Encoder

In [None]:
label_encoder = LabelEncoder()
categorical_cols = X.select_dtypes(include=['object']).columns

for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

### 25. Encode Vehicle Type Using One-Hot Encoding

In [None]:
X = pd.get_dummies(X, columns=["vehicle_type"], drop_first=True)

### 26. Split Data into Train (70%) and Test (30%)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 27. Apply Feature Scaling on Numerical Variables

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled=scalar.transdorm(X)