In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
# Show every column when a DataFrame is displayed.
# Use the public `pd.set_option`; `pd.pandas` only resolves as a side effect
# of how the package imports its own submodules.
pd.set_option('display.max_columns', None)

In [None]:
# Load the survey CSV from the Kaggle input directory into `insure`.
insure = pd.read_csv(
    filepath_or_buffer='/kaggle/input/mountains-vs-beaches-preference/mountains_vs_beaches_preferences.csv'
)

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
insure = (
    insure
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first ten shuffled rows.
insure.head(n=10)

In [None]:
# (rows, columns) of the loaded frame.
insure.shape

In [None]:
# Missing-value count per column (`isna` is the same check as `isnull`).
insure.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly.
insure.duplicated().sum()

In [None]:
# Number of distinct values in each column.
insure.nunique()

In [None]:
# Column dtypes, non-null counts and memory usage.
insure.info()

In [None]:
# Preview the last ten shuffled rows.
insure.tail(n=10)

In [None]:
# Single LabelEncoder instance; it is refitted for each categorical column.
label_encode=LabelEncoder()

In [None]:
# Replace each categorical column with integer codes. The shared encoder is
# refitted per column, so afterwards it only holds the mapping of the last one.
columns_to_encode = [
    'Preferred_Activities',
    'Favorite_Season',
    'Education_Level',
    'Location',
    'Gender',
]
for col in columns_to_encode:
    encoded_values = label_encode.fit_transform(insure[col])
    insure[col] = encoded_values

In [None]:
# Check the encoded columns on the first five rows.
insure.head(n=5)

In [None]:
# Feature matrix and target. The target column must be removed from `x`:
# leaving `Preference` among the features leaks the label into scaling,
# feature selection and any model trained on `x`.
x = insure.drop(columns='Preference')
y = insure['Preference']

In [None]:
# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Shapes of the full, training and test feature frames on one line.
print(*(frame.shape for frame in (x, x_train, x_test)))

In [None]:
# Standard deviation of the target column.
print(insure['Preference'].std())

In [None]:
# Standardiser; it is fitted on the training split only.
scaler=StandardScaler()

In [None]:
# Learn per-column mean and scale from the training features only.
scaler.fit(X=x_train)

In [None]:
# Standardise the training features with the fitted scaler.
x_train_standarized = scaler.transform(X=x_train)

In [None]:
# Show the standardised training array.
print(x_train_standarized)

In [None]:
# Apply the training-set statistics to the test features (no refit) and show them.
x_test_standarized = scaler.transform(X=x_test)
print(x_test_standarized)

In [None]:
# Overall standard deviation of each standardised array (train first, then test).
for standardized in (x_train_standarized, x_test_standarized):
    print(standardized.std())

In [None]:
# Class balance of the target.
insure.Preference.value_counts()

In [None]:
# Split rows by target value; this notebook labels 0 as mountains and 1 as beaches.
mountains = insure.loc[insure['Preference'] == 0]
beaches = insure.loc[insure['Preference'] == 1]

In [None]:
# Shapes of the two class subsets on one line.
print(*(subset.shape for subset in (mountains, beaches)))

In [None]:
# Undersample the `mountains` rows down to the size of `beaches` so the two
# classes are balanced. The size comes from the data instead of a hard-coded
# row count, is capped at the rows available, and the draw is seeded so a
# re-run yields the same balanced set.
mountains_sample = mountains.sample(
    n=min(len(mountains), len(beaches)),
    random_state=42,
)

In [None]:
# Stack the sampled mountains rows on top of all beaches rows.
balanced_parts = [mountains_sample, beaches]
new_dataset = pd.concat(objs=balanced_parts, axis=0)

In [None]:
# Class counts after balancing.
new_dataset.Preference.value_counts()

In [None]:
# Shapes of the sampled mountains subset and the beaches subset on one line.
print(*(subset.shape for subset in (mountains_sample, beaches)))

In [None]:
# Keep only features whose variance exceeds `threshold`.
# The target is excluded first: it is not a feature, and leaving it in makes
# `Preference` show up in the list of "selected features".
threshold = 0.1
balanced_features = new_dataset.drop(columns='Preference')
selector = VarianceThreshold(threshold)
selected_features = selector.fit_transform(balanced_features)
selected_feature_names = balanced_features.columns[selector.get_support()]
print("\nSelected Features:")
print(selected_feature_names)

In [None]:
# Print the full pairwise correlation table (target included, for inspection).
full_correlation = insure.corr()
print(full_correlation)

# The matrix used to search for redundant features must hold features only;
# otherwise a feature that tracks the target would get `Preference` itself
# flagged as a column to drop.
correlation_matrix = full_correlation.drop(index='Preference', columns='Preference')
threshold = 0.8
correlated_features = set()

In [None]:
# Walk the lower triangle of the matrix: a column is flagged when its absolute
# correlation with any earlier column exceeds `threshold`.
absolute_correlation = correlation_matrix.abs()
for position, column_name in enumerate(correlation_matrix.columns):
    earlier_columns = absolute_correlation.iloc[position, :position]
    if (earlier_columns > threshold).any():
        correlated_features.add(column_name)

print("Highly correlated features to drop:", correlated_features)

In [None]:
# Score every feature by mutual information with the target and keep the top 2.
# The estimator is randomised, so it is seeded and run once: the printed
# scores are exactly the ones SelectKBest ranked, and a re-run reproduces them.
selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=2,
)
x_new = selector.fit_transform(x, y)
mutual_info = selector.scores_

# Print scores
for feature, score in zip(x.columns, mutual_info):
    print(f"Feature: {feature}, Mutual Information Score: {score:.2f}")

print("Selected features after Mutual Information:", x.columns[selector.get_support()])

In [None]:
# Recursive feature elimination down to 2 features, ranked by random-forest
# importances. The forest is seeded so the selected pair is reproducible.
model = RandomForestClassifier(random_state=42)

# Apply RFE
rfe = RFE(estimator=model, n_features_to_select=2)
x_rfe = rfe.fit_transform(x, y)

# Print selected features
print("Selected features:", x.columns[rfe.support_])

In [None]:
# Greedy forward selection of 2 features, scored with logistic regression.
model = LogisticRegression()

selector = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select=2,
    direction='forward',
)
selector.fit(x, y)
x_new = selector.transform(x)

# Report which columns survived.
print("Selected features after Forward Selection:", x.columns[selector.get_support()])

In [None]:
# Sanity check: `x` is (rows, num_features) and `y` is (rows,); the row
# counts are expected to match.
for part in (x, y):
    print(part.shape)

In [None]:
# Fresh 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Remove the columns rejected by the feature-selection steps.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'`
# makes the cell idempotent: re-running it no longer raises KeyError because
# the columns are already gone.
columns_to_drop = ['Gender', 'Education_Level', 'Location', 'Favorite_Season']
new_dataset = new_dataset.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Preview the first ten rows of the reduced, balanced frame.
new_dataset.head(n=10)

In [None]:
# One box per remaining column, to eyeball scale differences and outliers.
# The frame is passed as `data=` explicitly: older seaborn releases bind the
# first positional argument to `x` instead of `data`.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=new_dataset, ax=ax)
ax.set_title('Value distribution per column (balanced dataset)')
plt.xticks(rotation=45)  # Rotate labels by 45 degrees
plt.show()

In [None]:
# Features and target taken from the balanced frame.
x = new_dataset.drop(columns='Preference')
y = new_dataset.Preference

In [None]:
# 90/10 hold-out split that preserves the class ratio of `y`, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=1,
)

In [None]:
# Class counts of the modelling target.
class_counts = y.value_counts()
print(class_counts)

In [None]:
# Shapes of the full, training and test feature frames on one line.
print(*(frame.shape for frame in (x, x_train, x_test)))

In [None]:
# Logistic regression behind a StandardScaler. The features fed to this model
# are on very different scales and were previously passed unscaled, which
# hurts solver convergence. Keeping the scaler inside the pipeline also means
# cross-validation refits it on each training fold only.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# Train on the training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Accuracy on the data the model was trained on.
# sklearn metrics take (y_true, y_pred) in that order; accuracy happens to be
# symmetric, but the conventional order keeps this safe to copy for other metrics.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [None]:
# Report the training accuracy.
training_message = 'Accuracy on training data : '
print(training_message, training_data_accuracy)

In [None]:
# Accuracy on the held-out test split.
# Arguments follow sklearn's (y_true, y_pred) order.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print('Accuracy on test data : ', test_data_accuracy)

In [None]:
# If `x` and `y` differ in length, cut the longer one down to the shorter.
n_features_rows, n_target_rows = len(x), len(y)
if n_features_rows != n_target_rows:
    if n_features_rows > n_target_rows:
        x = x[:n_target_rows]
    else:
        y = y[:n_features_rows]


In [None]:
# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Accuracy of `model` on each of the five folds.
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    cv=kfold,
    scoring='accuracy',
)

In [None]:
# Per-fold scores followed by their mean and spread.
for label, value in (
    ("Cross-Validation Scores:", scores),
    ("Mean Accuracy:", scores.mean()),
    ("Standard Deviation:", scores.std()),
):
    print(label, value)

In [None]:
# Predict the preference for one hand-written respondent.
# Values must be listed in the same order as the columns of `x`.
# NOTE(review): this notebook treats 1 as beaches and 0 as mountains; confirm
# the label meaning against the dataset description.
input_data = (46,46562,0,1,1469,71,280,0,0)   #1=beaches 0=mountains

input_data_as_numpy_array = np.asarray(input_data)

# Wrap the single row in a DataFrame carrying the training column names.
# The model was fitted on a named DataFrame, so a bare array triggers
# sklearn's "X does not have valid feature names" warning; building the frame
# also fails fast if the number of values does not match the feature count.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1),
    columns=x.columns,
)

prediction = model.predict(input_data_reshaped)
print(prediction)

if prediction[0] == 0:
    print('mountains')
else:
    print('Beaches')