In [1]:
import os
import subprocess
import sys

# 1. Automatically install required packages
required_packages = [
    "pandas", "numpy", "matplotlib", "seaborn",
    "scikit-learn", "fpdf", "joblib"
]

def install_packages(packages):
    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

install_packages(required_packages)

# Import libraries after installation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import joblib

# 2. Create directory structure for the project
project_dir = "Data_Science_Project"
subdirs = ["data", "notebooks", "models", "reports", "visualizations"]

for subdir in subdirs:
    os.makedirs(os.path.join(project_dir, subdir), exist_ok=True)

# 3. Simulate a dataset for user retention analysis
np.random.seed(42)
data = pd.DataFrame({
    'user_id': np.arange(1, 1001),
    'session_count': np.random.poisson(5, 1000),
    'days_since_last_session': np.random.randint(1, 30, 1000),
    'notification_clicks': np.random.randint(0, 10, 1000),
    'feedback_score': np.random.choice([1, 2, 3, 4, 5], 1000)
})

# Save the dataset to the 'data' folder
data.to_csv(os.path.join(project_dir, "data", "user_retention_data.csv"), index=False)

# 4. Function to create visualizations
def plot_visualizations(data, output_dir):
    # Histogram for user sessions distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(data['session_count'], kde=True, color='skyblue')
    plt.title('User Sessions Distribution')
    plt.xlabel('Number of Sessions')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, "user_sessions_distribution.png"))
    plt.close()

    # Correlation matrix
    plt.figure(figsize=(10, 6))
    sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.savefig(os.path.join(output_dir, "correlation_matrix.png"))
    plt.close()

    # Bar chart for feedback score distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(data['feedback_score'], palette='viridis')
    plt.title('Feedback Score Distribution')
    plt.xlabel('Feedback Score')
    plt.ylabel('Count')
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, "feedback_score_distribution.png"))
    plt.close()

# Generate visualizations
plot_visualizations(data, os.path.join(project_dir, "visualizations"))

# 5. Data preparation for modeling
X = data[['session_count', 'days_since_last_session', 'notification_clicks']]
y = (data['feedback_score'] > 3).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Model training using RandomForest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 7. Model evaluation
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# 8. Confusion matrix visualization
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
conf_matrix_path = os.path.join(project_dir, "visualizations", "confusion_matrix.png")
plt.savefig(conf_matrix_path)
plt.close()

# 9. ROC curve for model performance
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='darkorange')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
roc_curve_path = os.path.join(project_dir, "visualizations", "roc_curve.png")
plt.legend(loc="lower right")
plt.savefig(roc_curve_path)
plt.close()

# 10. Save the trained model
model_path = os.path.join(project_dir, "models", "user_retention_model.pkl")
joblib.dump(model, model_path)

# 11. Create a README.md file with project information
readme_content = """
# Data Science Project: User Retention Analysis

## Project Description
This project simulates a user retention analysis in a mobile app. The goal is to identify patterns of user retention, segment users, and predict retention likelihood.

## Objectives
- Analyze user retention patterns.
- Train a classification model to predict user retention.
- Provide visual insights into user behavior.

## How to Run
1. Execute this script to run the analysis and model training.
2. Review the generated visualizations and the report PDF in the respective folders.

## Requirements
- Python 3.6 or higher
- Libraries: pandas, numpy, matplotlib, seaborn, scikit-learn, fpdf, joblib

## Results
The project provides insights into user retention and a trained model to predict user behavior.
"""

readme_path = os.path.join(project_dir, "README.md")
with open(readme_path, "w") as f:
    f.write(readme_content)

# 12. Detailed PDF report creation
pdf = FPDF()
pdf.add_page()
pdf.set_auto_page_break(auto=True, margin=15)

# Report title
pdf.set_font("Arial", size=16, style='B')
pdf.cell(0, 10, "Data Science Project: User Retention Analysis", ln=True, align='C')

# Introduction
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Introduction", ln=True)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10, "This project aims to analyze user retention in a mobile app, focusing on user behavior patterns and prediction of retention likelihood. "
                      "The goal is to identify factors affecting retention and provide actionable insights for improving engagement.")

# Methodology
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Methodology", ln=True)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10, "The project involves data collection, cleaning, exploration, and modeling. The Random Forest model was selected for its effectiveness in classification tasks.")

# Analysis and Results
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Analysis and Results", ln=True)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10, f"The classification model achieved an accuracy of {model.score(X_test, y_test):.2f}. "
                      "Key insights include high correlation between session count and retention likelihood.")

pdf.add_page()
pdf.image(conf_matrix_path, x=10, y=10, w=180)
pdf.ln(85)
pdf.image(roc_curve_path, x=10, y=100, w=180)

# Conclusions and Recommendations
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Conclusions and Recommendations", ln=True)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10, "Improving notification strategies and increasing session engagement could enhance user retention.")

# Save the PDF report
pdf_file_path = os.path.join(project_dir, "reports", "User_Retention_Analysis_Report.pdf")
pdf.output(pdf_file_path)

print(f"Project created successfully in '{project_dir}'")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data['feedback_score'], palette='viridis')


Project created successfully in 'Data_Science_Project'


In [3]:
import shutil
from google.colab import files

shutil.make_archive("19", 'zip', ".")

files.download("19.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from fpdf import FPDF

# 1. Simulate a dataset
np.random.seed(42)
data = pd.DataFrame({
    'user_id': np.arange(1, 1001),
    'session_count': np.random.poisson(5, 1000),
    'days_since_last_session': np.random.randint(1, 30, 1000),
    'notification_clicks': np.random.randint(0, 10, 1000),
    'feedback_score': np.random.choice([1, 2, 3, 4, 5], 1000)
})

# 2. Create visualizations and save as images
# User Sessions Distribution
plt.figure(figsize=(10, 6))
sns.histplot(data['session_count'], kde=True, color='skyblue')
plt.title('User Sessions Distribution')
plt.xlabel('Number of Sessions')
plt.ylabel('Frequency')
plt.grid(True)
session_dist_path = "user_sessions_distribution.png"
plt.savefig(session_dist_path)
plt.close()

# Correlation Matrix
plt.figure(figsize=(10, 6))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
correlation_matrix_path = "correlation_matrix.png"
plt.savefig(correlation_matrix_path)
plt.close()

# Feedback Score Distribution
plt.figure(figsize=(10, 6))
sns.countplot(data['feedback_score'], palette='viridis')
plt.title('Feedback Score Distribution')
plt.xlabel('Feedback Score')
plt.ylabel('Count')
feedback_dist_path = "feedback_score_distribution.png"
plt.savefig(feedback_dist_path)
plt.close()

# 3. Create the detailed PDF
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)

# 4. Report title page
pdf.add_page()
pdf.set_font("Arial", size=24, style='B')
pdf.set_text_color(0, 102, 204)  # Dark blue color
pdf.cell(0, 20, "Data Science Project: User Retention Analysis", ln=True, align='C')

# Subtitle
pdf.set_font("Arial", size=16, style='B')
pdf.set_text_color(0, 0, 0)  # Black color
pdf.cell(0, 10, "A Comprehensive Analysis of User Behavior", ln=True, align='C')
pdf.ln(10)

# 5. Introduction (1-2 pages)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10,
    "The main goal of this project is to analyze user retention in a mobile application. "
    "We aim to identify key factors affecting user engagement and retention, using data-driven insights. "
    "The analysis provides actionable recommendations to improve user interaction and business outcomes.\n\n"
    "This project uses a simulated dataset to explore retention patterns and predict user behavior. "
    "The insights derived from this analysis can guide strategic decisions for product enhancement, marketing optimization, "
    "and overall customer satisfaction."
)
pdf.ln(10)

# Key objectives
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Key Objectives", ln=True)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10,
    "- Identify user retention patterns and key influencing factors.\n"
    "- Develop a classification model to predict user retention likelihood.\n"
    "- Provide visual insights and recommendations for improving user engagement.\n"
)

# 6. Methodology (2-3 pages)
pdf.add_page()
pdf.set_font("Arial", size=14, style='B')
pdf.cell(0, 10, "Methodology", ln=True)
pdf.ln(5)

pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10,
    "The project follows a structured approach that includes data collection, cleaning, exploration, modeling, and evaluation. "
    "The dataset simulates user interactions, with variables such as session count, days since last session, notification clicks, "
    "and feedback scores.\n\n"
    "The modeling phase uses a Random Forest Classifier, chosen for its robustness and accuracy in classification tasks. "
    "The model is evaluated using metrics such as accuracy, confusion matrix, and ROC-AUC curve."
)
pdf.ln(10)

# 7. Analysis and Results (3-5 pages)
pdf.add_page()

# Add user sessions distribution chart
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "User Sessions Distribution", ln=True)
pdf.image(session_dist_path, x=10, y=30, w=180)
pdf.ln(85)

# Add correlation matrix
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Correlation Matrix", ln=True)
pdf.image(correlation_matrix_path, x=10, y=120, w=180)
pdf.ln(95)

# New page for additional charts
pdf.add_page()

# Add feedback score distribution chart
pdf.set_font("Arial", size=12, style='B')
pdf.cell(0, 10, "Feedback Score Distribution", ln=True)
pdf.image(feedback_dist_path, x=10, y=30, w=180)
pdf.ln(85)

# 8. Conclusions and Recommendations (1-2 pages)
pdf.add_page()
pdf.set_font("Arial", size=14, style='B')
pdf.cell(0, 10, "Conclusions and Recommendations", ln=True)
pdf.ln(5)

pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10,
    "The analysis reveals that session count and notification engagement are key factors influencing user retention. "
    "Users with higher session counts and frequent interactions with notifications have a greater likelihood of retention.\n\n"
    "Recommendations include enhancing notification strategies, optimizing user experience, and personalizing content "
    "to increase engagement. Future analysis could benefit from real-world data to validate these findings and refine the model.\n"
    "\nNext steps involve implementing predictive models in real-time within the app and testing new engagement strategies based on insights gained."
)

# 9. Save the detailed PDF
pdf_file_path = "Detailed_User_Retention_Analysis_Report.pdf"
pdf.output(pdf_file_path)

print(f"Detailed PDF report created successfully: {pdf_file_path}")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data['feedback_score'], palette='viridis')


Detailed PDF report created successfully: Detailed_User_Retention_Analysis_Report.pdf
