# Question 1

In [2]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif

data = pd.read_csv('property.csv')
X = data.select_dtypes(include=['float64', 'int64'])

X = X.drop(columns=['price'])  # Assuming 'price' is the target variable
y = data['price']

ig_selector = SelectKBest(mutual_info_classif, k='all')
ig_selector.fit(X, y)
ig_scores = pd.DataFrame({'Feature': X.columns, 'IG Score': ig_selector.scores_})
ig_selected_features = ig_scores.sort_values(by='IG Score', ascending=False)['Feature'].tolist()

print("Selected features using Information Gain (IG) method:")
print(ig_selected_features)

Selected features using Information Gain (IG) method:
['location_id', 'latitude', 'longitude', 'area_sqft', 'area_marla', 'bedrooms', 'baths', 'property_id', 'day', 'year', 'month']


# Question 2

In [3]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

data = pd.read_csv('property.csv')

X = data.select_dtypes(include=['float64', 'int64'])

X = X.drop(columns=['price'])
y = data['price']

chi2_selector = SelectKBest(chi2, k='all')
chi2_selector.fit(X, y)
chi2_scores = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi2_selector.scores_})
chi2_selected_features = chi2_scores.sort_values(by='Chi2 Score', ascending=False)['Feature'].tolist()

print("Selected features using χ^2 (chi-square) method:")
print(chi2_selected_features)

Selected features using χ^2 (chi-square) method:
['area_sqft', 'property_id', 'location_id', 'area_marla', 'baths', 'bedrooms', 'day', 'latitude', 'month', 'longitude', 'year']


# Question 3

In [4]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

data = pd.read_csv('property.csv')

X = data.select_dtypes(include=['float64', 'int64'])

correlation_matrix = X.corr()
variance_selector = VarianceThreshold()
variance_selector.fit(X)
variance_selected_features = X.columns[variance_selector.get_support()].tolist()

print("Selected features using Correlation & Variance methods:")
print(variance_selected_features)

Selected features using Correlation & Variance methods:
['property_id', 'location_id', 'price', 'latitude', 'longitude', 'baths', 'area_marla', 'area_sqft', 'bedrooms', 'year', 'month', 'day']


# Question 4

In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Read the dataset
data = pd.read_csv('property.csv')

# Select numeric features
numeric_data = data.select_dtypes(include=['number'])

# Check for missing values
missing_values = numeric_data.isnull().sum()
print("Missing values in numeric data:")
print(missing_values)

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
numeric_data_imputed = pd.DataFrame(imputer.fit_transform(numeric_data), columns=numeric_data.columns)

# Define target variable and features
X = numeric_data_imputed.drop(columns=['price'])  # Replace 'price' with the name of your target variable
y = numeric_data_imputed['price']

# Calculate correlation with target variable and variance
correlation = X.corrwith(y)
variance = X.var()

# Select features based on correlation and variance thresholds
selected_features = X.columns[(correlation.abs() > 0.5) & (variance > 0.1)]

# Display selected features
print("Selected Features based on Correlation and Variance Methods:")
print(selected_features)


Missing values in numeric data:
property_id    0
location_id    0
price          0
latitude       0
longitude      0
baths          0
area_marla     0
area_sqft      0
bedrooms       0
year           0
month          0
day            0
dtype: int64
Selected Features based on Correlation and Variance Methods:
Index([], dtype='object')


# Question 5

In [11]:
import pandas as pd

# Step 1: Load the dataset
data = pd.read_csv('diabetes.csv')

# Step 2: Separate features (X) and target variable (y)
X = data.drop(columns=['Outcome'])  # Assuming 'Outcome' is the target variable
y = data['Outcome']

# Step 3: Calculate the mean absolute difference (MAD) for each feature
mad_values = X.apply(lambda x: (x - x.mean()).abs().mean())

# Step 4: Select the features with the highest MAD values
selected_features = mad_values.nlargest(5).index  # Selecting top 5 features

# Display selected features
print("Selected Features based on Mean Absolute Difference (MAD) Method:")
print(selected_features)


Selected Features based on Mean Absolute Difference (MAD) Method:
Index(['Insulin', 'Glucose', 'SkinThickness', 'BloodPressure', 'Age'], dtype='object')


# Question 6

In [12]:
import pandas as pd
import numpy as np

# Step 1: Load the dataset
data = pd.read_csv('churn.csv')

# Step 2: Choose a continuous target variable for dispersion ratio calculation
# Let's choose 'Day Mins' as the target variable for this example
y = data['Day Mins']

# Step 3: Calculate the dispersion ratio for the chosen target variable
# Calculate the Interquartile Range (IQR)
iqr_y = y.quantile(0.75) - y.quantile(0.25)
# Calculate the Median Absolute Deviation (MAD)
mad_y = np.median(np.abs(y - y.median()))
# Calculate the dispersion ratio
dispersion_ratio_y = iqr_y / mad_y

# Step 4: Display the dispersion ratio for the chosen target variable
print("Dispersion Ratio for the target variable (Day Mins):", dispersion_ratio_y)


Dispersion Ratio for the target variable (Day Mins): 2.002754820936639
