### Q1. What is the difference between Ordinal Encoding and Label Encoding? Provide an example of when you might choose one over the other.

In [1]:
#Ordinal Encoding 

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Create a DataFrame with categorical data
data = {
    'education': ['High School', "Bachelor's Degree", "Master's Degree", 'PhD']
}

df = pd.DataFrame(data)

# Apply Ordinal Encoding to the categorical feature
ordinal_encoder = OrdinalEncoder()

df['education_encoded'] = ordinal_encoder.fit_transform(df[['education']])

print(df)


           education  education_encoded
0        High School                1.0
1  Bachelor's Degree                0.0
2    Master's Degree                2.0
3                PhD                3.0


In [2]:
#Label Encoding

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Create a DataFrame with categorical data
data = {
    'color': ['Red', 'Blue', 'Green']
}

df = pd.DataFrame(data)

# Apply Label Encoding to the categorical feature
label_encoder = LabelEncoder()

df['color_encoded'] = label_encoder.fit_transform(df['color'])

print(df)


   color  color_encoded
0    Red              2
1   Blue              0
2  Green              1


### Q2. Explain how Target Guided Ordinal Encoding works and provide an example of when you might use it in a machine learning project.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = {'Feature': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
        'Target': [1, 0, 1, 0, 1, 0, 1, 0, 1]}
df = pd.DataFrame(data)

mean_target = df.groupby('Feature')['Target'].mean()
df['Feature_Encoded'] = df['Feature'].map(mean_target)

X_train, X_test, y_train, y_test = train_test_split(df[['Feature_Encoded']], df['Target'], test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


Accuracy: 0.0


### Q3. Define covariance and explain why it is important in statistical analysis. How is covariance calculated?

### Q4. For a dataset with the following categorical variables: Color (red, green, blue), Size (small, medium, large), and Material (wood, metal, plastic), perform label encoding using Python's scikit-learn library. Show your code and explain the output.

In [7]:
from sklearn.preprocessing import LabelEncoder

data = {'Color': ['red', 'green', 'blue', 'green', 'red'],
        'Size': ['small', 'medium', 'large', 'small', 'medium'],
        'Material': ['wood', 'metal', 'plastic', 'plastic', 'metal']}
df = pd.DataFrame(data)

label_encoder = LabelEncoder()


df_encoded = df.apply(label_encoder.fit_transform)

print(df_encoded)


   Color  Size  Material
0      2     2         2
1      1     1         0
2      0     0         1
3      1     2         1
4      2     1         0


### Q5. Calculate the covariance matrix for the following variables in a dataset: Age, Income, and Education level. Interpret the results.

In [15]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Age': [30, 35, 40, 45, 50],
    'Income': [50000, 60000, 70000, 80000, 90000],
    'Education': [12, 14, 16, 18, 20]
})

covariance_matrix = df[['Age', 'Income', 'Education']].cov()

print(covariance_matrix)


                Age       Income  Education
Age            62.5     125000.0       25.0
Income     125000.0  250000000.0    50000.0
Education      25.0      50000.0       10.0


### Q6. You are working on a machine learning project with a dataset containing several categorical variables, including "Gender" (Male/Female), "Education Level" (High School/Bachelor's/Master's/PhD), and "Employment Status" (Unemployed/Part-Time/Full-Time). Which encoding method would you use for each variable, and why?

In [16]:
import pandas as pd

# Create a DataFrame with categorical variables
df = pd.DataFrame({
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Education Level': ['High School', "Bachelor's", "Master's", 'PhD', "Bachelor's"],
    'Employment Status': ['Unemployed', 'Part-Time', 'Full-Time', 'Full-Time', 'Part-Time']
})


df['Gender_Encoded'] = df['Gender'].replace({'Male': 0, 'Female': 1})

gender_encoded = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, gender_encoded], axis=1)


education_order = ['High School', "Bachelor's", "Master's", 'PhD']
df['Education_Encoded'] = df['Education Level'].map(lambda x: education_order.index(x))

df['Education_Encoded'] = df['Education Level'].astype('category').cat.codes


employment_encoded = pd.get_dummies(df['Employment Status'], prefix='Employment')
df = pd.concat([df, employment_encoded], axis=1)

print(df)


   Gender Education Level Employment Status  Gender_Encoded  Gender_Female  \
0    Male     High School        Unemployed               0              0   
1  Female      Bachelor's         Part-Time               1              1   
2    Male        Master's         Full-Time               0              0   
3  Female             PhD         Full-Time               1              1   
4    Male      Bachelor's         Part-Time               0              0   

   Gender_Male  Education_Encoded  Employment_Full-Time  Employment_Part-Time  \
0            1                  1                     0                     0   
1            0                  0                     0                     1   
2            1                  2                     1                     0   
3            0                  3                     1                     0   
4            1                  0                     0                     1   

   Employment_Unemployed  
0                

### Q7. You are analyzing a dataset with two continuous variables, "Temperature" and "Humidity", and two categorical variables, "Weather Condition" (Sunny/Cloudy/Rainy) and "Wind Direction" (North/South/East/West). Calculate the covariance between each pair of variables and interpret the results.

In [21]:
import pandas as pd

import pandas as pd

# Create a dictionary with the data
data = {
    'Temperature': [25.5, 26.2, 24.8, 27.3, 23.9],
    'Humidity': [60, 55, 62, 58, 65],
    'Weather Condition': ['Sunny', 'Cloudy', 'Rainy', 'Sunny', 'Cloudy'],
    'Wind Direction': ['North', 'South', 'East', 'West', 'North']
}

df = pd.DataFrame(data)

continuous_vars = ['Temperature', 'Humidity']

cov_continuous = df[continuous_vars].cov()

categorical_vars = ['Weather Condition', 'Wind Direction']

dummy_vars = pd.get_dummies(df[categorical_vars])

data = pd.concat([df[continuous_vars], dummy_vars], axis=1)

cov_continuous_categorical = data.cov()

print("Covariance between continuous variables:")
print(cov_continuous)

print("\nCovariance between continuous and categorical variables:")
print(cov_continuous_categorical)


Covariance between continuous variables:
             Temperature  Humidity
Temperature        1.693    -4.125
Humidity          -4.125    14.500

Covariance between continuous and categorical variables:
                          Temperature      Humidity  Weather Condition_Cloudy  \
Temperature                     1.693 -4.125000e+00             -2.450000e-01   
Humidity                       -4.125  1.450000e+01             -2.775558e-17   
Weather Condition_Cloudy       -0.245 -2.775558e-17              3.000000e-01   
Weather Condition_Rainy        -0.185  5.000000e-01             -1.000000e-01   
Weather Condition_Sunny         0.430 -5.000000e-01             -2.000000e-01   
Wind Direction_East            -0.185  5.000000e-01             -1.000000e-01   
Wind Direction_North           -0.420  1.250000e+00              5.000000e-02   
Wind Direction_South            0.165 -1.250000e+00              1.500000e-01   
Wind Direction_West             0.440 -5.000000e-01             -1.