1. **Logistic Regression: loading and preprocessing the data**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['Age', 'WorkClass', 'fnlwgt', 'Education', 'EducationNum', 'MaritalStatus', 'Occupation', 
                'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income']

# Load data from the UCI repository (or use a local file path).
# header=None: the first line is read as data, so names are supplied explicitly.
# na_values=" ?": the missing-value marker is matched with its leading space
# (presumably because string fields in the raw file are written after ", " --
# verify against the file if the source changes).
data = pd.read_csv(url, names=column_names, na_values=" ?", header=None)

# Display basic information
print("Data shape:", data.shape)
print(data.head())

# Handle missing values by filling with the most frequent value in each column.
# data.mode().iloc[0] is the first row of per-column modes; assigning the result
# (rather than mutating in place) keeps the step a plain, re-runnable expression.
data = data.fillna(data.mode().iloc[0])

# Encode categorical (object-dtype) columns as integer codes.
# Each column gets its OWN LabelEncoder, stored in `label_encoders`, so the
# code -> label mapping of any column can be recovered later with
# label_encoders[col].inverse_transform(...) or label_encoders[col].classes_.
# `encoder` ends the loop bound to the encoder of the last categorical column
# (presumably 'Income', the final column -- confirm before relying on it).
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder



Data shape: (32561, 15)
   Age          WorkClass  fnlwgt   Education  EducationNum  \
0   39          State-gov   77516   Bachelors            13   
1   50   Self-emp-not-inc   83311   Bachelors            13   
2   38            Private  215646     HS-grad             9   
3   53            Private  234721        11th             7   
4   28            Private  338409   Bachelors            13   

         MaritalStatus          Occupation    Relationship    Race      Sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   CapitalGain  CapitalLoss  HoursPerWeek   NativeCountry  Income  
0         2174            0            40   United-Sta

2. **Training the model (Data Splitting and Data Fitting)**

In [None]:
# Target is the 'Income' column; features are every other column
y = data['Income']
X = data.drop(columns=['Income'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only,
# then apply that same fitted scaler to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a Logistic Regression model on the standardized training features
logistic_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the held-out (standardized) test rows
y_test_pred = logistic_model.predict(X_test_scaled)


3. **Print performance metrics**

In [13]:

# Score the test-set predictions first, then report both metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
report_text = classification_report(y_test, y_test_pred)

print("\n\nTest Accuracy:", test_accuracy)
print("\n\nClassification Report:\n", report_text)




Test Accuracy: 0.8258866881621373


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.94      0.89      4942
           1       0.72      0.46      0.56      1571

    accuracy                           0.83      6513
   macro avg       0.78      0.70      0.73      6513
weighted avg       0.81      0.83      0.81      6513



4. **Save the predictions on the test set to a CSV file**

In [11]:

# Pair each true test label with the prediction made for the same row.
# `y_test_pred` holds the predictions produced by `logistic_model` on the
# scaled test set, so this cell runs on a fresh kernel (Restart & Run All).
test_predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
# index=False: the original row index of the test rows is not written to the file
test_predictions.to_csv('test_predictions.csv', index=False)

print("\nPredictions saved to 'test_predictions.csv'.")


Predictions saved to 'test_predictions.csv'.
