Python

In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

# Step 2: Load dataset
df = pd.read_csv("Car Insurance.csv")

In [None]:
# Step 3: Explore dataset
print("\n--- First 5 rows ---")
print(df.head())

print("\n--- Info ---")
print(df.info())

print("\n--- Summary statistics ---")
print(df.describe(include="all"))


--- First 5 rows ---
     Make   Age   Mileage Fuel Gearbox Colour Claimed
0  Toyota   2.0   27000.0    P       A    Red     Yes
1    Ford   4.0   30500.0    P       M  Black     Yes
2  Toyota  15.0  120000.0    D       A  White      No
3  Nissan  13.0   53000.0    D       M  White      No
4  Nissan   2.0       NaN    D       M  Black      No

--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Make     10 non-null     object 
 1   Age      9 non-null      float64
 2   Mileage  9 non-null      float64
 3   Fuel     9 non-null      object 
 4   Gearbox  9 non-null      object 
 5   Colour   9 non-null      object 
 6   Claimed  10 non-null     object 
dtypes: float64(2), object(5)
memory usage: 692.0+ bytes
None

--- Summary statistics ---
          Make        Age        Mileage Fuel Gearbox Colour Claimed
count       10   9.000000       9.0000

In [None]:
# Indexing & slicing examples
print("\nElement at row 0, col 0:", df.iloc[0,0])
print("\nFirst column:")
print(df.iloc[:,0])
print("\nSecond row:")
print(df.iloc[1])


Element at row 0, col 0: Toyota

First column:
0    Toyota
1      Ford
2    Toyota
3    Nissan
4    Nissan
5      Ford
6    Toyota
7    Nissan
8    Toyota
9    Nissan
Name: Make, dtype: object

Second row:
Make          Ford
Age            4.0
Mileage    30500.0
Fuel             P
Gearbox          M
Colour       Black
Claimed        Yes
Name: 1, dtype: object


In [None]:
# Step 4: Unique values and counts
for col in df.columns:
    print(f"\nUnique values in {col}: {df[col].unique()}")
    print(df[col].value_counts())


Unique values in Make: ['Toyota' 'Ford' 'Nissan']
Make
Toyota    4
Nissan    4
Ford      2
Name: count, dtype: int64

Unique values in Age: [ 2.  4. 15. 13.  8. nan 20.  7.]
Age
2.0     2
13.0    2
4.0     1
15.0    1
8.0     1
20.0    1
7.0     1
Name: count, dtype: int64

Unique values in Mileage: [ 27000.  30500. 120000.  53000.     nan  73000. 138000.  38000.  67000.
  36000.]
Mileage
27000.0     1
30500.0     1
120000.0    1
53000.0     1
73000.0     1
138000.0    1
38000.0     1
67000.0     1
36000.0     1
Name: count, dtype: int64

Unique values in Fuel: ['P' 'D' nan]
Fuel
D    5
P    4
Name: count, dtype: int64

Unique values in Gearbox: ['A' 'M' nan]
Gearbox
M    6
A    3
Name: count, dtype: int64

Unique values in Colour: ['Red' 'Black' 'White' 'Green' nan 'Blue']
Colour
Black    3
Green    2
White    2
Red      1
Blue     1
Name: count, dtype: int64

Unique values in Claimed: ['Yes' 'No']
Claimed
Yes    5
No     5
Name: count, dtype: int64


In [None]:
# Step 5: Sorting examples
print("\nSorted ascending by Age:")
print(df.sort_values(by="Age").head())


Sorted ascending by Age:
     Make  Age  Mileage Fuel Gearbox Colour Claimed
0  Toyota  2.0  27000.0    P       A    Red     Yes
4  Nissan  2.0      NaN    D       M  Black      No
1    Ford  4.0  30500.0    P       M  Black     Yes
9  Nissan  7.0  36000.0    P       M  Black     Yes
5    Ford  8.0  73000.0  NaN       M  Green      No


In [None]:
print("\nSorted descending by Age:")
print(df.sort_values(by="Age", ascending=False).head())


Sorted descending by Age:
     Make   Age   Mileage Fuel Gearbox Colour Claimed
7  Nissan  20.0   38000.0    D     NaN  Green     Yes
2  Toyota  15.0  120000.0    D       A  White      No
8  Toyota  13.0   67000.0    D       A   Blue      No
3  Nissan  13.0   53000.0    D       M  White      No
5    Ford   8.0   73000.0  NaN       M  Green      No


In [None]:
# Step 6: Statistics on numerical column
print("\nAge column: Min, Max, Mean")
print(df["Age"].min(), df["Age"].max(), df["Age"].mean())


Age column: Min, Max, Mean
2.0 20.0 9.333333333333334


In [None]:
# Step 7: Missing values check
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
Make       0
Age        1
Mileage    1
Fuel       1
Gearbox    1
Colour     1
Claimed    0
dtype: int64


In [None]:
# Step 8: Impute missing values
num_cols = ["Age", "Mileage"]
cat_cols = [col for col in df.columns if col not in num_cols + ["Claimed"]]

In [None]:
# Numerical imputation (mean)
num_imputer = SimpleImputer(strategy="mean")
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [None]:
# Categorical imputation (most frequent)
cat_imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [None]:
print("\nAfter imputation:")
print(df.isnull().sum())


After imputation:
Make       0
Age        0
Mileage    0
Fuel       0
Gearbox    0
Colour     0
Claimed    0
dtype: int64


In [None]:
# Step 9: Encoding categorical variables
# Ordinal encoding example
ord_enc = OrdinalEncoder()
df[cat_cols] = ord_enc.fit_transform(df[cat_cols])

In [None]:
# One-hot encoding example
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_array = ohe.fit_transform(df[cat_cols])
ohe_df = pd.DataFrame(ohe_array, columns=ohe.get_feature_names_out(cat_cols))

# Concatenate with original
df_encoded = pd.concat([df.drop(columns=cat_cols), ohe_df], axis=1)

print("\nAfter encoding:")
print(df_encoded.head())


After encoding:
    Age        Mileage Claimed  Make_1.0  Make_2.0  Fuel_1.0  Gearbox_1.0  \
0   2.0   27000.000000     Yes       0.0       1.0       1.0          0.0   
1   4.0   30500.000000     Yes       0.0       0.0       1.0          1.0   
2  15.0  120000.000000      No       0.0       1.0       0.0          0.0   
3  13.0   53000.000000      No       1.0       0.0       0.0          1.0   
4   2.0   64722.222222      No       1.0       0.0       0.0          1.0   

   Colour_1.0  Colour_2.0  Colour_3.0  Colour_4.0  
0         0.0         0.0         1.0         0.0  
1         0.0         0.0         0.0         0.0  
2         0.0         0.0         0.0         1.0  
3         0.0         0.0         0.0         1.0  
4         0.0         0.0         0.0         0.0  


In [None]:

# Step 10: Feature scaling
scaler_std = StandardScaler()
scaled_std = scaler_std.fit_transform(df_encoded.drop(columns=["Claimed"]))

scaler_mm = MinMaxScaler()
scaled_mm = scaler_mm.fit_transform(df_encoded.drop(columns=["Claimed"]))

print("\nStandardized (first 5 rows):")
print(scaled_std[:5])

print("\nMinMax scaled (first 5 rows):")
print(scaled_mm[:5])

# Final cleaned dataset
print("\nFinal dataset shape:", df_encoded.shape)
print("Columns:", df_encoded.columns.tolist())


Standardized (first 5 rows):
[[-1.30454146e+00 -1.05562430e+00 -8.16496581e-01  1.22474487e+00
   1.22474487e+00 -1.52752523e+00 -3.33333333e-01 -5.00000000e-01
   3.00000000e+00 -5.00000000e-01]
 [-9.48757423e-01 -9.57679776e-01 -8.16496581e-01 -8.16496581e-01
   1.22474487e+00  6.54653671e-01 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01 -5.00000000e-01]
 [ 1.00805476e+00  1.54690159e+00 -8.16496581e-01  1.22474487e+00
  -8.16496581e-01 -1.52752523e+00 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01  2.00000000e+00]
 [ 6.52270728e-01 -3.28036417e-01  1.22474487e+00 -8.16496581e-01
  -8.16496581e-01  6.54653671e-01 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01  2.00000000e+00]
 [-1.30454146e+00 -2.03611484e-16  1.22474487e+00 -8.16496581e-01
  -8.16496581e-01  6.54653671e-01 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01 -5.00000000e-01]]

MinMax scaled (first 5 rows):
[[0.         0.         0.         1.         1.         0.
  0.         0.         1.         0.      

C++

In [29]:
%%writefile Lab01.cpp

#include <bits/stdc++.h>
using namespace std;

// ---------------- CSV Reader ----------------
class CSVReader {
public:
    vector<string> headers;
    vector<vector<string>> data;

    vector<string> split(const string &line, char delim=',') {
        vector<string> tokens;
        string token;
        stringstream ss(line);
        while (getline(ss, token, delim)) tokens.push_back(token);
        return tokens;
    }

    void load(string filename) {
        ifstream fin(filename);
        if (!fin) throw runtime_error("File not found");
        string line;
        getline(fin, line); // header row
        headers = split(line);
        while (getline(fin, line)) {
            if (line.size()) data.push_back(split(line));
        }
    }
};

// ---------------- Imputer ----------------
class Imputer {
public:
    static void mean(vector<double> &col) {
        double sum=0; int count=0;
        for (double v: col) if (!isnan(v)) { sum+=v; count++; }
        double mu = (count>0)? sum/count : 0;
        for (double &v: col) if (isnan(v)) v = mu;
    }
    static void median(vector<double> &col) {
        vector<double> clean;
        for (double v: col) if (!isnan(v)) clean.push_back(v);
        if (clean.empty()) return;
        sort(clean.begin(), clean.end());
        double med = clean[clean.size()/2];
        for (double &v: col) if (isnan(v)) v = med;
    }
    static void mode(vector<string> &col) {
        unordered_map<string,int> freq;
        for (auto &s: col) if (s!="NaN" && !s.empty()) freq[s]++;
        string best=""; int bestCount=0;
        for (auto &p: freq) if (p.second > bestCount) { best=p.first; bestCount=p.second; }
        for (auto &s: col) if (s=="NaN" || s.empty()) s = best;
    }
};

// ---------------- Ordinal Encoder ----------------
class OrdinalEncoder {
    map<string,int> mapping;
public:
    vector<int> fit_transform(const vector<string> &col) {
        int code=0; vector<int> out;
        for (auto &val: col) {
            if (!mapping.count(val)) mapping[val]=code++;
            out.push_back(mapping[val]);
        }
        return out;
    }
};

// ---------------- Scalers ----------------
class StandardScaler {
    double mu, sigma;
public:
    vector<double> fit_transform(const vector<double> &col) {
        double sum=0; for (double v: col) sum+=v;
        mu=sum/col.size();
        double sq=0; for (double v: col) sq+=(v-mu)*(v-mu);
        sigma=sqrt(sq/col.size());
        vector<double> out;
        for (double v: col) out.push_back((v-mu)/sigma);
        return out;
    }
};

class MinMaxScaler {
    double mn, mx;
public:
    vector<double> fit_transform(const vector<double> &col) {
        mn=*min_element(col.begin(), col.end());
        mx=*max_element(col.begin(), col.end());
        vector<double> out;
        for (double v: col) out.push_back((v-mn)/(mx-mn));
        return out;
    }
};

// ---------------- Main ----------------
int main() {
    CSVReader reader;
    reader.load("Car Insurance.csv");
    int n = reader.data.size();
    int m = reader.headers.size();

    cout << "Loaded " << n << " rows, " << m << " columns\n";

    // Safer string-to-double converter
    auto safe_stod = [](const string &s) -> double {
        if (s.empty() || s == "NaN") return NAN;
        try {
            return stod(s);
        } catch (...) {
            return NAN; // if conversion fails, mark missing
        }
    };

    // ---------------- Extract numeric columns ----------------
    vector<double> age(n), mileage(n);
    for (int i=0;i<n;i++) {
        age[i] = safe_stod(reader.data[i][1]);
        mileage[i] = safe_stod(reader.data[i][2]);
    }

    cout << "\n--- Task: Handle Missing Values ---\n";
    Imputer::mean(age);
    Imputer::median(mileage);

    for (int i=0; i<5; i++)
        cout << "Row " << i << ": Age=" << age[i] << ", Mileage=" << mileage[i] << "\n";

    // ---------------- Extract categorical columns ----------------
    vector<string> make(n), fuel(n), gearbox(n), colour(n);
    for (int i=0;i<n;i++) {
        make[i]=reader.data[i][0];
        fuel[i]=reader.data[i][3];
        gearbox[i]=reader.data[i][4];
        colour[i]=reader.data[i][5];
    }

    cout << "\n--- Task: Impute Categorical Values ---\n";
    Imputer::mode(make);
    Imputer::mode(fuel);
    Imputer::mode(gearbox);
    Imputer::mode(colour);

    for (int i=0; i<5; i++)
        cout << "Row " << i << ": Make=" << make[i] << ", Fuel=" << fuel[i]
             << ", Gearbox=" << gearbox[i] << ", Colour=" << colour[i] << "\n";

    // ---------------- Ordinal Encoding ----------------
    OrdinalEncoder encMake, encFuel, encGear, encColour;
    vector<int> makeOrd = encMake.fit_transform(make);
    vector<int> fuelOrd = encFuel.fit_transform(fuel);
    vector<int> gearOrd = encGear.fit_transform(gearbox);
    vector<int> colourOrd = encColour.fit_transform(colour);

    cout << "\n--- Task: Ordinal Encoding ---\n";
    for (int i=0;i<5;i++)
        cout << "Row " << i << ": MakeOrd=" << makeOrd[i] << ", FuelOrd=" << fuelOrd[i]
             << ", GearboxOrd=" << gearOrd[i] << ", ColourOrd=" << colourOrd[i] << "\n";

    // ---------------- Scaling ----------------
    StandardScaler stdScaler;
    vector<double> ageStd = stdScaler.fit_transform(age);
    MinMaxScaler mmScaler;
    vector<double> mileageMM = mmScaler.fit_transform(mileage);

    cout << "\n--- Task: Scaling ---\n";
    for (int i=0;i<5;i++)
        cout << "Row " << i << ": AgeStd=" << ageStd[i] << ", MileageMM=" << mileageMM[i] << "\n";

    // ---------------- Write Cleaned Dataset ----------------
    ofstream fout("CarInsurance_Cleaned.csv");
    fout << "MakeOrd,FuelOrd,GearboxOrd,ColourOrd,AgeStd,MileageMM,Claimed\n";
    for (int i=0;i<n;i++) {
        fout << makeOrd[i] << "," << fuelOrd[i] << "," << gearOrd[i] << ","
             << colourOrd[i] << "," << ageStd[i] << "," << mileageMM[i]
             << "," << reader.data[i].back() << "\n";
    }
    fout.close();

    cout << "\n--- Final Cleaned Dataset Preview ---\n";
    ifstream check("CarInsurance_Cleaned.csv");
    string line;
    for (int i=0; i<6 && getline(check, line); i++) {
        cout << line << "\n";
    }

    return 0;
}

Writing Lab01.cpp


In [30]:
!g++ Lab01.cpp -o Lab01
!./Lab01

Loaded 10 rows, 7 columns

--- Task: Handle Missing Values ---
Row 0: Age=2, Mileage=27000
Row 1: Age=4, Mileage=30500
Row 2: Age=15, Mileage=120000
Row 3: Age=13, Mileage=53000
Row 4: Age=2, Mileage=53000

--- Task: Impute Categorical Values ---
Row 0: Make=Toyota, Fuel=P, Gearbox=A, Colour=Red
Row 1: Make=Ford, Fuel=P, Gearbox=M, Colour=Black
Row 2: Make=Toyota, Fuel=D, Gearbox=A, Colour=White
Row 3: Make=Nissan, Fuel=D, Gearbox=M, Colour=White
Row 4: Make=Nissan, Fuel=D, Gearbox=M, Colour=Black

--- Task: Ordinal Encoding ---
Row 0: MakeOrd=0, FuelOrd=0, GearboxOrd=0, ColourOrd=0
Row 1: MakeOrd=1, FuelOrd=0, GearboxOrd=1, ColourOrd=1
Row 2: MakeOrd=0, FuelOrd=1, GearboxOrd=0, ColourOrd=2
Row 3: MakeOrd=2, FuelOrd=1, GearboxOrd=1, ColourOrd=2
Row 4: MakeOrd=2, FuelOrd=1, GearboxOrd=1, ColourOrd=1

--- Task: Scaling ---
Row 0: AgeStd=-1.30454, MileageMM=0
Row 1: AgeStd=-0.948757, MileageMM=0.0315315
Row 2: AgeStd=1.00805, MileageMM=0.837838
Row 3: AgeStd=0.652271, MileageMM=0.234234
R