# Data Loading and Exploration

In [None]:
import pandas as pd

# Load the grades dataset from the remote CSV (requires network access).
url = "https://raw.githubusercontent.com/dsrscientist/dataset4/main/Grades.csv"
df = pd.read_csv(url)

# Display the first few rows of the dataset
print("Head of the dataset:")
print(df.head())

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Information:")
df.info()

# Count of missing values per column
print("\nMissing Values:")
print(df.isnull().sum())

# Summary statistics for numerical columns
print("\nSummary Statistics:")
print(df.describe())

# Number of distinct values in each column
print("\nUnique Values in Each Column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

# Distribution of CGPA
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the CGPA column with a kernel-density overlay, drawn on an
# explicitly created Axes rather than through the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['CGPA'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of CGPA')
ax.set_xlabel('CGPA')
ax.set_ylabel('Frequency')
plt.show()


# Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Handle missing values (drop rows with missing values for simplicity).
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df.dropna().copy()

# Label-encode every column whose name contains 'AB-'.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# learned category -> integer mapping of each column stays available later
# (e.g. to encode new data the same way or to invert the encoding). Re-fitting
# one shared encoder would retain only the mapping of the last column.
# NOTE(review): columns whose name does not contain 'AB-' are passed through
# unchanged; if any of them are non-numeric they reach the model as-is --
# confirm the filter matches the dataset's actual column names.
label_encoders = {}
for col in df.columns:
    if 'AB-' in col:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])
        label_encoders[col] = label_encoder

# Split the data into features (X) and target variable (y)
X = df.drop('CGPA', axis=1)
y = df['CGPA']

# Train-Test Split: hold out 20% of the rows; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column (CGPA) from the predictor columns.
y = df['CGPA']
X = df.drop(columns='CGPA')

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition.
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


# Model Selection

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Build a seeded random forest regressor and fit it on the training partition
# in one step (fit() returns the fitted estimator).
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Generate predictions for the held-out rows.
y_pred = model.predict(X_test)

# Compare predictions with the true targets: mean squared error and R^2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


# Model Training

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate the regressor; the fixed seed makes the fitted forest repeatable.
model = RandomForestRegressor(random_state=42)

# Fit on the training partition. This stays the cell's final expression so the
# notebook still renders the fitted estimator.
model.fit(X=X_train, y=y_train)


# Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Generate predictions for the held-out feature rows.
y_pred = model.predict(X_test)

# Score the predictions against the true targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


# Prediction

In [None]:
# Predict CGPA with the random forest fitted in the training section (`model`).
# This notebook does not define a separate dataset of new observations, so the
# held-out test features are used as a stand-in. To score real new data,
# assign it to `X_new` instead; it needs the same columns and the same
# preprocessing as the training features.
X_new = X_test

# Predict CGPA for new data or the test set
y_pred_new = model.predict(X_new)

# Display the predicted CGPA values
print("Predicted CGPA for new data or the test set:")
print(y_pred_new)
