In [None]:
import pandas as pd

# Location of the raw Titanic training CSV (fetched over HTTPS from GitHub).
url = "https://github.com/dsrscientist/dataset1/raw/master/titanic_train.csv"

# Read the remote CSV straight into a DataFrame; this needs network access.
titanic_df = pd.read_csv(filepath_or_buffer=url)

# Sanity check: show the leading rows (head() defaults to five).
print(titanic_df.head(n=5))


In [None]:
# Dataset dimensions as a (n_rows, n_columns) tuple.
dataset_shape = titanic_df.shape
print("Number of rows and columns:", dataset_shape)

# Leading rows, for a quick look at the columns and their values.
preview_rows = titanic_df.head()
print("\nFirst few rows of the dataset:\n", preview_rows)

# Per-column count of missing entries (isna is an alias of isnull).
missing_counts = titanic_df.isna().sum()
print("\nMissing values in each column:\n", missing_counts)


In [None]:
# Report how many values are missing in each column before imputation.
print("Columns with missing values:\n", titanic_df.isnull().sum())

# Impute missing 'Age' values with the column mean.
# The result is assigned back rather than using
# `titanic_df['Age'].fillna(..., inplace=True)`: that form mutates a
# temporary column selection (chained assignment), which emits a warning on
# recent pandas and leaves the frame unchanged under Copy-on-Write.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())

# Impute missing 'Embarked' values with the most frequent value
# (mode() returns a Series; [0] picks its first entry).
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0])

# Drop 'Cabin' because most of its values are missing.
# errors='ignore' keeps this step from raising if the cell is run again
# after the column has already been removed.
titanic_df = titanic_df.drop(columns=['Cabin'], errors='ignore')

# Verify that missing values have been handled.
print("\nColumns with missing values after handling:\n", titanic_df.isnull().sum())


In [None]:
# One-hot encode the 'Sex' and 'Embarked' columns. drop_first=True omits the
# first level of each variable, so the original columns are replaced by one
# fewer indicator column than they have distinct values.
titanic_df = pd.get_dummies(
    data=titanic_df,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

# Preview the encoded frame.
encoded_preview = titanic_df.head()
print("\nDataFrame after one-hot encoding:\n", encoded_preview)


In [None]:
# FamilySize = siblings/spouses aboard + parents/children aboard + 1
# (the +1 counts the passenger themselves).
titanic_df['FamilySize'] = titanic_df['SibSp'].add(titanic_df['Parch']).add(1)

# Preview the frame with the engineered column appended.
engineered_preview = titanic_df.head()
print("\nDataFrame after feature engineering:\n", engineered_preview)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# The one-hot encoding step replaces 'Sex' and 'Embarked' with indicator
# columns, so plotting x='Sex' / x='Embarked' directly raises. Rebuild
# readable labels from the indicators when the raw columns are gone.
if 'Sex' in titanic_df.columns:
    sex_labels = titanic_df['Sex']
else:
    sex_labels = (
        titanic_df['Sex_male']
        .astype(bool)
        .map({True: 'male', False: 'female'})
        .rename('Sex')
    )

if 'Embarked' in titanic_df.columns:
    embarked_labels = titanic_df['Embarked']
else:
    embarked_dummies = titanic_df.filter(like='Embarked_').astype(int)
    # Rows with no indicator set belong to the level removed by
    # drop_first=True. NOTE(review): assumed to be 'C' (alphabetically
    # first port code) -- confirm against the raw data.
    embarked_labels = (
        embarked_dummies.idxmax(axis=1)
        .str.replace('Embarked_', '', regex=False)
        .where(embarked_dummies.any(axis=1), 'C')
        .rename('Embarked')
    )

# Explore the distribution of numerical features
plt.figure(figsize=(12, 8))
sns.histplot(titanic_df['Age'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Age')
plt.show()

plt.figure(figsize=(12, 8))
sns.countplot(x='Pclass', data=titanic_df, palette='viridis')
plt.title('Distribution of Passenger Class (Pclass)')
plt.show()

# Explore the distribution of categorical features
plt.figure(figsize=(12, 8))
sns.countplot(x=sex_labels, palette='pastel')
plt.title('Distribution of Sex')
plt.show()

plt.figure(figsize=(12, 8))
sns.countplot(x=embarked_labels, palette='Set2')
plt.title('Distribution of Embarked')
plt.show()


In [None]:
# Visualize relationships between numerical features
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Age', y='Fare', hue='Survived', data=titanic_df, palette='coolwarm')
plt.title('Relationship between Age, Fare, and Survival')
plt.show()

# Visualize relationships between categorical features.
# 'Sex' is replaced by the 'Sex_male' indicator once the frame is one-hot
# encoded, so rebuild a labelled column on a plotting copy when needed.
if 'Sex' in titanic_df.columns:
    survival_plot_df = titanic_df
else:
    survival_plot_df = titanic_df.assign(
        Sex=titanic_df['Sex_male'].astype(bool).map({True: 'male', False: 'female'})
    )

# sns.catplot is figure-level: it creates its own figure, which left the
# plt.figure() above it blank. The axes-level barplot draws the same
# mean-survival bars on the figure created here, so size and title apply.
plt.figure(figsize=(12, 8))
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=survival_plot_df, palette='muted')
plt.title('Survival Rate across Passenger Class and Sex')
plt.show()


In [None]:
# Analyze survival rates across different categories.
# After one-hot encoding, the raw 'Sex' and 'Embarked' columns no longer
# exist and grouping by them raises KeyError; rebuild label Series from the
# indicator columns when the raw columns are missing.
if 'Sex' in titanic_df.columns:
    sex_labels = titanic_df['Sex']
else:
    sex_labels = (
        titanic_df['Sex_male']
        .astype(bool)
        .map({True: 'male', False: 'female'})
        .rename('Sex')
    )

if 'Embarked' in titanic_df.columns:
    embarked_labels = titanic_df['Embarked']
else:
    embarked_dummies = titanic_df.filter(like='Embarked_').astype(int)
    # Rows with no indicator set belong to the level removed by
    # drop_first=True. NOTE(review): assumed to be 'C' -- confirm against
    # the raw data.
    embarked_labels = (
        embarked_dummies.idxmax(axis=1)
        .str.replace('Embarked_', '', regex=False)
        .where(embarked_dummies.any(axis=1), 'C')
        .rename('Embarked')
    )

# The mean of the 0/1 'Survived' column within each group is its survival rate.
survival_by_class = titanic_df.groupby('Pclass')['Survived'].mean()
survival_by_sex = titanic_df.groupby(sex_labels)['Survived'].mean()
survival_by_embarked = titanic_df.groupby(embarked_labels)['Survived'].mean()

print("Survival Rate by Passenger Class:\n", survival_by_class)
print("\nSurvival Rate by Sex:\n", survival_by_sex)
print("\nSurvival Rate by Embarked:\n", survival_by_embarked)


In [None]:
# Calculate the absolute correlation between each feature and the target.
# Only numeric/boolean columns are used: free-text columns such as 'Name' and
# 'Ticket' cannot be correlated and make DataFrame.corr() raise on pandas
# versions that no longer drop them silently. Casting to float keeps boolean
# indicator columns usable.
numeric_features = titanic_df.select_dtypes(include=['number', 'bool']).astype(float)

correlation_with_survived = (
    numeric_features.corr()['Survived']
    # Remove the target's correlation with itself (always 1.0); otherwise
    # 'Survived' would pass any threshold and be selected as a feature.
    .drop('Survived')
    .abs()
    .sort_values(ascending=False)
)

# Display the correlation coefficients
print("Correlation with Survived:\n", correlation_with_survived)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the feature matrix for an importance estimate.
# - 'Survived' is the target and must not be a predictor.
# - 'PassengerId' is dropped as a row identifier (NOTE(review): assumed to
#   carry no predictive meaning -- confirm).
# - Only numeric/boolean columns are kept: string columns such as 'Name' and
#   'Ticket' cannot be passed to RandomForestClassifier.fit.
X = (
    titanic_df
    .drop(columns=['Survived', 'PassengerId'], errors='ignore')
    .select_dtypes(include=['number', 'bool'])
)
y = titanic_df['Survived']

# Initialize the Random Forest model; a fixed seed makes the importances
# reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)

# Fit the model to the data.
# NOTE(review): this fits on all rows, including those later held out for
# testing, so the importance-based selection has seen the test set.
rf_model.fit(X, y)

# Extract feature importances, labelled by the columns the model was fit on
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Display feature importances
print("\nFeature Importance:\n", feature_importance)


In [None]:
# Set a threshold for correlation or importance
correlation_threshold = 0.1  # Adjust as needed
importance_threshold = 0.01  # Adjust as needed

# Select features whose score meets the respective threshold
selected_features_corr = correlation_with_survived[correlation_with_survived >= correlation_threshold].index
selected_features_importance = feature_importance[feature_importance >= importance_threshold].index

# Display selected features
print("\nSelected Features based on Correlation:\n", selected_features_corr)
print("\nSelected Features based on Importance:\n", selected_features_importance)

# Combine both selections.
# - The target 'Survived' is removed explicitly: it correlates perfectly with
#   itself and would otherwise leak into the feature matrix.
# - The result is a sorted list rather than a set, so column order is
#   deterministic and the value can be used directly as a DataFrame indexer.
selected_features = sorted(
    set(selected_features_corr).union(selected_features_importance) - {'Survived'}
)
print("\nFinal Selected Features:\n", selected_features)


In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix from the selected features.
# - Convert to a sorted list: a set is unordered and recent pandas refuses it
#   as a column indexer.
# - Exclude the target so it can never appear among the predictors.
feature_columns = sorted(col for col in selected_features if col != 'Survived')
X = titanic_df[feature_columns]
y = titanic_df['Survived']

# Split the data into training and testing sets (80% training, 20% testing);
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build a seeded Random Forest and train it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the rows the model was trained on. These scores measure how well
# the model fits the training data, not how well it generalises.
y_train_pred = rf_model.predict(X_train)

# Compute the training-set metrics, then report them.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred)
train_confusion = confusion_matrix(y_train, y_train_pred)

print("Training Set Performance:")
print("Accuracy:", train_accuracy)
print("Classification Report:\n", train_report)
print("Confusion Matrix:\n", train_confusion)


In [None]:
# Score the held-out split with the already-trained model.
y_test_pred = rf_model.predict(X_test)

# Compute the test-set metrics, then report them.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

print("Testing Set Performance:")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the grid search tries every
# combination (3 * 4 * 3 * 3 = 108 settings).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator that the search clones for each candidate.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy under 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best Hyperparameters:", grid_search.best_params_)

# The search refits the best configuration; take that fitted estimator.
best_rf_model = grid_search.best_estimator_

# Score the held-out split with the tuned model.
y_test_pred_tuned = best_rf_model.predict(X_test)

# Compute the tuned model's test-set metrics, then report them.
tuned_accuracy = accuracy_score(y_test, y_test_pred_tuned)
tuned_report = classification_report(y_test, y_test_pred_tuned)
tuned_confusion = confusion_matrix(y_test, y_test_pred_tuned)

print("\nTuned Model Performance on Testing Set:")
print("Accuracy:", tuned_accuracy)
print("Classification Report:\n", tuned_report)
print("Confusion Matrix:\n", tuned_confusion)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameter distributions.
# scipy.stats.randint(low, high) is a frozen discrete-uniform distribution
# over [low, high) that RandomizedSearchCV samples from. It is not iterable,
# so the original `list(randint(1, 30, 20))` raised TypeError; 'max_depth'
# is given as an explicit candidate list instead (None = unlimited depth,
# plus depths 1..29).
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(range(1, 30)),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'bootstrap': [True, False]
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Create a RandomizedSearchCV object: 100 sampled settings, each scored by
# accuracy under 5-fold cross-validation; random_state makes the sampling
# reproducible.
random_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)

# Perform randomized search on the training set
random_search.fit(X_train, y_train)

# Print the best hyperparameters found by the randomized search
print("Best Hyperparameters:", random_search.best_params_)

# Get the best model from the randomized search
best_rf_model = random_search.best_estimator_

# Make predictions on the testing set using the best model
y_test_pred_tuned = best_rf_model.predict(X_test)

# Evaluate the tuned model on the testing set
print("\nTuned Model Performance on Testing Set:")
print("Accuracy:", accuracy_score(y_test, y_test_pred_tuned))
print("Classification Report:\n", classification_report(y_test, y_test_pred_tuned))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_tuned))


In [None]:
# Predict on new observations with the tuned model.
# 'new_data' must be a DataFrame that has had the same preprocessing applied
# as the training data (imputation, one-hot encoding, engineered columns).
# It is not defined anywhere in this notebook, so to keep "Run All" working
# the first rows of the held-out split are used as a stand-in when it is
# absent. Define 'new_data' in an earlier cell to score real observations.
new_observations = globals().get('new_data')
if new_observations is None:
    new_observations = X_test.head()

# Select columns in the exact order the model was trained on; the order of
# the training frame's columns is the authoritative one.
model_feature_columns = list(X_train.columns)

# Use the trained model to make predictions on new data
new_data_predictions = best_rf_model.predict(new_observations[model_feature_columns])

# Display the predictions
print("Predictions on new data:\n", new_data_predictions)


In [None]:
# Extract feature importances from the best model.
# feature_importances_ is ordered like the columns the model was fit on, so
# label it with X_train.columns. Labelling with 'selected_features' is unsafe:
# as a set it is unordered (and rejected as a Series index), and any other
# ordering would attach importances to the wrong feature names.
feature_importance = pd.Series(best_rf_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# Display feature importances
print("Feature Importance:\n", feature_importance)

# Plot feature importances for better visualization
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
feature_importance.plot(kind='barh')
plt.title("Feature Importance in Predicting Survival")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()
