In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.impute import SimpleImputer

# Create a sample dataset
data = {
    'Feature1': [5, 7, 8, np.nan, 10, 7, 5, 6],
    'Feature2': [1, 2, 1, 3, 4, np.nan, 2, 1],
    'Feature3': [3, 2, 1, 4, 3, 2, 4, 5],
    'Target': [1, 0, 1, 0, 1, 0, 1, 0]
}

# Load the dataset into a DataFrame
df = pd.DataFrame(data)

# Task 1: Identify columns with missing values
missing_values = df.isnull().sum()
print("Columns with missing values and their count:")
print(missing_values[missing_values > 0])

# Task 2: Replace missing values with mean (for numerical columns) or mode (for categorical columns)
# Identifying numerical columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Replace missing numerical values with mean
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Verify if missing values are handled
print("\nMissing values after imputation:")
print(df.isnull().sum())

# Task 3: Compare model performance with and without handling missing values

# Split the data into features (X) and target (y)
X = df.drop('Target', axis=1)
y = df['Target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model without handling missing values (using the original dataset)
df_original = pd.DataFrame(data)  # Load original dataset
X_original = df_original.drop('Target', axis=1)
y_original = df_original['Target']

# Impute missing values in the original dataset before splitting into train/test
imputer = SimpleImputer(strategy='mean')
X_original_imputed = imputer.fit_transform(X_original)

X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_original_imputed, y_original, test_size=0.2, random_state=42)

# Train model on original dataset (with imputation)
model = LogisticRegression(max_iter=200)
model.fit(X_train_orig, y_train_orig)
y_pred_orig = model.predict(X_test_orig)

# Evaluate accuracy on original dataset
accuracy_without_handling = accuracy_score(y_test_orig, y_pred_orig)
print(f"\nAccuracy without handling missing values: {accuracy_without_handling:.4f}")

# Train model after handling missing values (already done in the df)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate accuracy after handling missing values
accuracy_with_handling = accuracy_score(y_test, y_pred)
print(f"\nAccuracy after handling missing values: {accuracy_with_handling:.4f}")

Columns with missing values and their count:
Feature1    1
Feature2    1
dtype: int64

Missing values after imputation:
Feature1    0
Feature2    0
Feature3    0
Target      0
dtype: int64

Accuracy without handling missing values: 0.0000

Accuracy after handling missing values: 0.0000


In [None]:
# 2. Duplicate Data: Repeated data points can skew analysis and model results.
#     Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
#     Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
#     Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
    
    
    

In [None]:
# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.
#     Task 1: Convert a column of string numbers to integers in a dataset.
#     Task 2: Identify and correct columns with inconsistent data types in a dataset.
#     Task 3: Discuss why correct data types are critical for feature engineering.
    
    
    

In [None]:
# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.
#     Task 1: Visualize a dataset and identify outliers using a boxplot.
#     Task 2: Remove or adjust outliers and re-analyze the dataset.
#     Task 3: Research and report on a technique for handling outliers effectively.
    
    
    