<a href="https://colab.research.google.com/github/Mahfujul-01726/Random/blob/main/SynergyX2024_Datathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # Use RandomForestRegressor for regression tasks
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report  # For classification tasks
from sklearn.impute import SimpleImputer  # For imputing missing target values

# Load the dataset
data = pd.read_csv("train.csv")

# Separate features and target
X = data.drop(columns=['v16'], errors='ignore')  # Feature matrix without target column
y = data['v16'] if 'v16' in data.columns else None  # Target variable

# Drop non-numeric columns from features, assuming 'id' is retained separately for later merging
X_numeric = X.select_dtypes(include=[int, float])

# Impute missing values in the feature matrix using the mean (for numeric features)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_numeric)

# Initialize the SimpleImputer for the target variable 'v16' (impute with the most frequent class)
y_imputer = SimpleImputer(strategy='most_frequent')  # For classification, you can also use 'mean' for regression
y_imputed = y_imputer.fit_transform(data[['v16']])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_imputed.ravel(), test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier (use RandomForestRegressor if it's a regression task)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust parameters as needed

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model (for classification)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# If it's a regression problem, you can use the following metrics instead:
# from sklearn.metrics import mean_squared_error, r2_score
# print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
# print("R^2 Score:", r2_score(y_test, y_pred))


Accuracy: 0.9772685196364895
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99    121237
         1.0       0.89      0.07      0.12      2996

    accuracy                           0.98    124233
   macro avg       0.93      0.53      0.56    124233
weighted avg       0.98      0.98      0.97    124233



# Starting

 # final_df=train

In [8]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the dataset correctly
data = pd.read_csv("train.csv")

# Separate features and target
X = data.drop(columns=['v16'], errors='ignore')  # Feature matrix without target column
y = data['v16'] if 'v16' in data.columns else None  # Target variable

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(exclude=[int, float]).columns
print("Non-numeric columns:", non_numeric_cols)

# Apply Label Encoding for non-numeric columns (if needed for ordinal data)
label_encoder = LabelEncoder()
for col in non_numeric_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Select only numeric columns (now encoded if non-numeric)
X_numeric = X.select_dtypes(include=[int, float])

# Impute missing values in the feature matrix using the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_numeric)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply PCA to retain 95% of the variance
pca = PCA(0.95)
X_pca = pca.fit_transform(X_scaled)

# Impute missing values in the target variable 'v16' using RandomForestClassifier
if y is not None:
    if y.isna().any():
        # Separate rows with and without missing values in y
        X_no_nan = X_pca[y.notna()]  # Rows without missing values in y
        y_no_nan = y.dropna()        # Target values without NaN
        X_nan = X_pca[y.isna()]      # Rows with missing values in y

        # Train a classifier to predict missing values
        classifier = RandomForestClassifier(random_state=42)
        classifier.fit(X_no_nan, y_no_nan)

        # Predict missing values in y and impute them
        y_pred_nan = classifier.predict(X_nan)
        y_imputed = y.copy()
        y_imputed.loc[y.isna()] = y_pred_nan
    else:
        y_imputed = y  # No missing values in y
else:
    print("Target variable 'v16' not found.")
    y_imputed = None

# Apply SMOTE to the PCA-transformed data and the imputed target variable
if y_imputed is not None:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_pca, y_imputed)

    # Create a DataFrame for the resampled data
    pca_columns = [f'PC{i+1}' for i in range(X_resampled.shape[1])]
    X_resampled_df = pd.DataFrame(data=X_resampled, columns=pca_columns)
    y_resampled_df = pd.Series(y_resampled, name='v16')

    # Combine the resampled PCA components with the target variable
    final_df = pd.concat([X_resampled_df, y_resampled_df.reset_index(drop=True)], axis=1)
else:
    print("Target variable 'v16' is not available for resampling with SMOTE.")

# Output the final DataFrame
print("Resampled DataFrame with balanced target variable:\n", final_df)


Non-numeric columns: Index(['v15', 'v39', 'v41', 'v42'], dtype='object')
Resampled DataFrame with balanced target variable:
               PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0       -1.917209 -0.148872  0.516312  0.900320 -0.179700 -2.005388 -0.220570   
1        1.331313  1.845028  0.879955  1.764853 -0.310367 -0.228063  0.335680   
2       -2.782442 -0.114113 -0.260353 -0.124727 -0.317031 -2.491713 -0.176647   
3       -1.265401  2.236560 -0.135987 -0.078908  0.164585  1.105585  0.600814   
4       -1.428050  0.589388  1.320871  2.083077 -0.129150  0.269370  0.069195   
...           ...       ...       ...       ...       ...       ...       ...   
1133085 -0.894332 -2.589127  0.478656  0.473043  0.059328  0.580407 -0.201536   
1133086  3.062639 -4.250206  0.176884  0.384854 -0.079298 -1.788413 -0.466390   
1133087  3.457507 -3.530111 -0.361433 -0.406244  0.059948  1.009309 -0.044757   
1133088 -1.685137 -2.718116 -1.167727 -1.834311  0.039681  0.1083

In [37]:
import pandas as pd

# Assuming 'data' is your DataFrame
nan_check = final_df.isna().sum().sum()

# If nan_check > 0, there are missing values in the dataset
if nan_check > 0:
    print(f"The dataset contains {nan_check} missing values.")
else:
    print("The dataset contains no missing values.")


The dataset contains no missing values.


In [38]:
import pandas as pd

# Example: Assume 'data' is your DataFrame and 'target_column' is the column you want to check
value_counts = final_df['v16'].value_counts()
print(value_counts)


v16
0.0    566545
1.0    566545
Name: count, dtype: int64


# final_df1=test

In [6]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv("test.csv")

# Remove the target variable 'v16' if it exists
data_without_target = data.drop(columns=['v16'], errors='ignore')

# Identify non-numeric columns
non_numeric_cols = data_without_target.select_dtypes(exclude=[int, float]).columns
print("Non-numeric columns:", non_numeric_cols)

# Apply Label Encoding for non-numeric columns
label_encoder = LabelEncoder()
for col in non_numeric_cols:
    data_without_target[col] = label_encoder.fit_transform(data_without_target[col])

# Impute missing values in the dataset using the mean
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data_without_target)

# Standardize the features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_imputed)

# Apply PCA to retain 95% of the variance
pca = PCA(0.95)
data_pca = pca.fit_transform(data_scaled)

# Create a DataFrame with the principal components
pca_columns = [f'PC{i+1}' for i in range(data_pca.shape[1])]
data_pca_df = pd.DataFrame(data=data_pca, columns=pca_columns)

# Combine with the original 'id' column if it exists
if 'id' in data.columns:
    final_df1 = pd.concat([data[['id']].reset_index(drop=True), data_pca_df], axis=1)
else:
    print("Warning: 'id' column not found in the original data. Proceeding without it.")
    final_df1 = data_pca_df

# Output the final DataFrame
print(final_df1)


Non-numeric columns: Index(['v15', 'v39', 'v41', 'v42'], dtype='object')
            id       PC1       PC2       PC3       PC4       PC5       PC6  \
0            1  1.432645 -0.944836 -1.075168 -0.808539 -0.292322 -1.958712   
1            2 -2.035477 -1.778049  0.868146  0.873947 -0.002807 -0.064939   
2            3  5.343929  2.460963  0.656341  1.147806 -0.089056 -2.395672   
3            4  1.810176  3.296826  0.021966  0.300480 -0.149771 -1.026782   
4            5  3.923146  0.778475  0.062735  0.265831  0.056479 -0.341798   
...        ...       ...       ...       ...       ...       ...       ...   
186729  186730 -0.706463 -1.396845  1.504251  1.444677  0.161426  0.540021   
186730  186731 -0.335514 -2.439772  0.482738  0.486245  0.097402  0.052223   
186731  186732  0.177281  0.094525 -1.510632 -1.817764  0.157714 -0.105115   
186732  186733 -0.017424  2.528710  0.905368  0.949250  0.047906 -0.310260   
186733  186734  2.833654  1.776910  3.467190  3.628133  0.274323  1.0

# Apply random forest algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Assuming `final_df` is your training data and `final_df1` is your testing data

# Split `final_df` into features (X_train) and target (y_train)
X_train = final_df.drop(columns=['v16'], errors='ignore')  # Drop target column from training data
y_train = final_df['v16']  # Target variable

# Split `final_df1` into features (X_test)
X_test = final_df1  # Test data (only features, no target column in `final_df1`)

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
# Since `final_df1` doesn't contain the true target labels, we'll use the predicted results directly.
print("Predictions on the test data:", y_pred)

# Optionally, if you have a target variable in `final_df1` for evaluation, you could check accuracy
# For example:
# y_test = final_df1['v16']  # True labels in the test data (if available)
# print("Accuracy on test set:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))
