In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv('train1.csv')
test_data = pd.read_csv('test1.csv')

# Rename columns to ensure consistency
train_data.rename(columns={
    'preparation': 'test preparation',
    'parental level of education': 'parental level of education',
    'gender': 'gender',
    'lunch': 'lunch',
    'Section': 'Section'
}, inplace=True)

test_data.rename(columns={
    'preparation': 'test preparation',
    'parental level of education': 'parental level of education',
    'gender': 'gender',
    'lunch': 'lunch',
    'Section': 'Section'
}, inplace=True)
print("Train Data:")
print(train_data.head())
print(train_data.dtypes)

print("Test Data:")
print(test_data.head())
print(test_data.dtypes)


Train Data:
     Roll no preparation   gender parental level of education         lunch  \
0  EXA000001         none    male                some college      standard   
1  EXA000002         none    male             master's degree  free/reduced   
2  EXA000003         none    male             master's degree  free/reduced   
3  EXA000004         none  female                some college  free/reduced   
4  EXA000005         none  female                 high school      standard   

     Section  practical score  viva score  exam score  
0  Section A               70          73          70  
1  Section C               55          54          52  
2  Section E               56          46          43  
3  Section C               35          47          41  
4  Section C               87          92          81  
Roll no                        object
preparation                    object
gender                         object
parental level of education    object
lunch                    

In [3]:
categorical_columns_label = ['parental level of education']  # Columns for Label Encoding
categorical_columns_onehot = ['gender', 'Section', 'lunch', 'preparation']  # Columns for One-Hot Encoding

# Label Encoding for ordinal columns
label_encoders = {}
for column in categorical_columns_label:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    test_data[column] = le.transform(test_data[column].astype(str))
    label_encoders[column] = le

# One-Hot Encoding for nominal columns
onehot_encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns_onehot)
    ],
    remainder='passthrough'
)


In [5]:
# Check if 'test preparation' column is in train_data and test_data
if 'preparation' in train_data.columns:
    X_train_features = train_data.drop(['Roll no', 'exam score'], axis=1)
else:
    # Handle case when 'test preparation' is not in train_data
    print("'test preparation' column not found in train_data")
    X_train_features = train_data.drop(['Roll no', 'exam score'], axis=1, errors='ignore')  # Adjust if needed

if 'preparation' in test_data.columns:
    X_test_features = test_data.drop(['Roll no'], axis=1)
else:
   
    print("'test preparation' column not found in test_data")
    X_test_features = test_data.drop(['Roll no'], axis=1, errors='ignore')  # Adjust if needed

# Print the columns of X_train_features to verify
print(X_train_features.columns)


'test preparation' column not found in train_data
'test preparation' column not found in test_data
Index(['preparation ', 'gender', 'parental level of education', 'lunch',
       'Section', 'practical score', 'viva score'],
      dtype='object')


In [6]:
print("X_train_features Columns:")
print(X_train_features.columns)
print("X_test_features Columns:")
print(X_test_features.columns)

X_train_features Columns:
Index(['preparation ', 'gender', 'parental level of education', 'lunch',
       'Section', 'practical score', 'viva score'],
      dtype='object')
X_test_features Columns:
Index(['preparation ', 'gender', 'parental level of education', 'lunch',
       'Section', 'practical score', 'viva score'],
      dtype='object')


In [8]:
categorical_columns_label = ['parental level of education']  # Columns for Label Encoding
categorical_columns_onehot = ['gender', 'Section', 'lunch','preparation ']  # Columns for One-Hot Encoding

# Label Encoding for ordinal columns
label_encoders = {}
for column in categorical_columns_label:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    test_data[column] = le.transform(test_data[column].astype(str))
    label_encoders[column] = le
label_encoders1 = {}
for column in categorical_columns_onehot:
    le1 = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    test_data[column] = le.transform(test_data[column].astype(str))
    label_encoders1[column] = le1

# Print the first few rows after Label Encoding
print("Train Data After Label Encoding:")
print(train_data.head())
print("Test Data After Label Encoding:")
print(test_data.head())

Train Data After Label Encoding:
     Roll no  preparation   gender  parental level of education  lunch  \
0  EXA000001             1       1                            4      1   
1  EXA000002             1       1                            3      0   
2  EXA000003             1       1                            3      0   
3  EXA000004             1       0                            4      0   
4  EXA000005             1       0                            2      1   

   Section  practical score  viva score  exam score  
0        0               70          73          70  
1        2               55          54          52  
2        4               56          46          43  
3        2               35          47          41  
4        2               87          92          81  
Test Data After Label Encoding:
    Roll no  preparation   gender  parental level of education  lunch  \
0  EXA32000             1       1                            0      1   
1  EXA32001         

In [10]:
# Strip any extra whitespace from column names
train_data.columns = train_data.columns.str.strip()
test_data.columns = test_data.columns.str.strip()


In [11]:
columns_to_drop_train = ['Roll no', 'exam score']
columns_to_drop_test = ['Roll no']

# Only drop columns that exist
columns_to_drop_train = [col for col in columns_to_drop_train if col in train_data.columns]
columns_to_drop_test = [col for col in columns_to_drop_test if col in test_data.columns]

X_train_features = train_data.drop(columns=columns_to_drop_train, axis=1)
X_test_features = test_data.drop(columns=columns_to_drop_test, axis=1)

In [12]:
print("X_train_features Columns:")
print(X_train_features.columns)
print("X_test_features Columns:")
print(X_test_features.columns)

X_train_features Columns:
Index(['preparation', 'gender', 'parental level of education', 'lunch',
       'Section', 'practical score', 'viva score'],
      dtype='object')
X_test_features Columns:
Index(['preparation', 'gender', 'parental level of education', 'lunch',
       'Section', 'practical score', 'viva score'],
      dtype='object')


In [22]:
onehot_encoder = OneHotEncoder(drop='first', sparse=False)
# Apply One-Hot Encoding
X_train_encoded = onehot_encoder.fit_transform(X_train_features)
X_test_encoded = onehot_encoder.transform(X_test_features)
y_train = train_data['exam score']



In [26]:
print("X_train_encoded shape:", X_train_encoded.shape)
print("X_test_encoded shape:", X_test_encoded.shape)
print("y_train shape:", y_train.shape)


X_train_encoded shape: (31999, 160)
X_test_encoded shape: (100, 160)
y_train shape: (25599,)


In [28]:
# Extract 'exam score' column
if 'exam score' in train_data.columns:
    y_train = train_data['exam score'].values
else:
    raise ValueError("'exam score' column is not present in train_data")

# Ensure y_train matches the number of rows in X_train_encoded
if len(y_train) != X_train_encoded.shape[0]:
    raise ValueError("Mismatch between number of rows in X_train_encoded and y_train")


In [33]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

# Sample DataFrames (replace with actual data)
# train_data = pd.DataFrame({...})
# test_data = pd.DataFrame({...})

# Define columns to drop
columns_to_drop_train = ['Roll no', 'exam score']
columns_to_drop_test = ['Roll no']

# Filter columns to drop
columns_to_drop_train = [col for col in columns_to_drop_train if col in train_data.columns]
columns_to_drop_test = [col for col in columns_to_drop_test if col in test_data.columns]

# Drop columns
X_train_features = train_data.drop(columns=columns_to_drop_train, axis=1)
X_test_features = test_data.drop(columns=columns_to_drop_test, axis=1)

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(drop='first', sparse=False)

# Apply One-Hot Encoding
X_train_encoded = onehot_encoder.fit_transform(X_train_features)
X_test_encoded = onehot_encoder.transform(X_test_features)

# Extract target variable
if 'exam score' in train_data.columns:
    y_train = train_data['exam score'].values
else:
    raise ValueError("'exam score' column is not present in train_data")

# Standardize features
scaler = StandardScaler()
X_train_encoded = scaler.fit_transform(X_train_encoded)
X_test_encoded = scaler.transform(X_test_encoded)

# Check consistency
print("X_train_encoded shape:", X_train_encoded.shape)
print("y_train shape:", y_train.shape)


X_train_encoded shape: (31999, 160)
y_train shape: (31999,)




In [50]:


# Standardize features
scaler = StandardScaler()
X_train_encoded = scaler.fit_transform(X_train_encoded)
X_test_encoded = scaler.transform(X_test_encoded)

# Check shapes before splitting
print("X_train_encoded shape:", X_train_encoded.shape)
print("y_train shape:", len(y_train))

# Ensure shapes are consistent
if X_train_encoded.shape[0] != len(y_train):
    raise ValueError("Mismatch between number of rows in X_train_encoded and y_train")

# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_encoded, y_train, test_size=0.2, random_state=42)

# Verify shapes after split
print("X_train_split shape:", X_train_split.shape)
print("X_val_split shape:", X_val_split.shape)
print("y_train_split shape:", y_train_split.shape)
print("y_val_split shape:", y_val_split.shape)

# Train a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_val_pred = model.predict(X_val_split)


X_train_encoded shape: (31999, 160)
y_train shape: 31999
X_train_split shape: (25599, 160)
X_val_split shape: (6400, 160)
y_train_split shape: (25599,)
y_val_split shape: (6400,)


In [51]:

mae = mean_absolute_error(y_val, y_val_pred)
print(f'Mean Absolute Error on validation set: {mae:.2f}')


Mean Absolute Error on validation set: 11.96
