<a href="https://colab.research.google.com/github/LAGISHETTYNANDITHA/Data-analysis-using-python/blob/main/T_P_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, chi2_contingency
from statsmodels.stats.weightstats import ztest

# Load the dataset
file_path = "/content/Student_Mental_Stress_and_Coping_Mechanisms.csv"
df = pd.read_csv(file_path)

# Handling missing values
imputer = SimpleImputer(strategy='most_frequent')
df.iloc[:, :] = imputer.fit_transform(df)

# Encoding categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Extract features and target variable
target = 'Mental Stress Level'
X = df.drop(columns=[target])
y = df[target]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1-score": f1_score(y_test, y_pred, average='weighted')
    }

# Print performance metrics
for model, metrics in results.items():
    print(f"\nModel: {model}")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# Feature importance from Random Forest
rf_model = models["Random Forest"]
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop 3 Important Features:")
print(feature_importance.head(3))

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print("\nExplained Variance Ratio:", pca.explained_variance_ratio_)

# One-sample T-test
t_stat, p_value = ttest_1samp(df[target], popmean=5)
print("\nT-test Results:")
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

# Z-test: Difference in study hours between students who attended counseling and those who did not
attended = df[df['Counseling Attendance'] == 1]['Study Hours Per Week']
not_attended = df[df['Counseling Attendance'] == 0]['Study Hours Per Week']
z_stat, p_ztest = ztest(attended, not_attended)
print("\nZ-test Results:")
print(f"Z-statistic: {z_stat:.4f}, P-value: {p_ztest:.4f}")

# Chi-square test for relationship between gender and counseling attendance
gender_counseling = pd.crosstab(df['Gender'], df['Counseling Attendance'])
chi2_stat, p_chi, dof, expected = chi2_contingency(gender_counseling)
print("\nChi-Square Test Results:")
print(f"Chi-square statistic: {chi2_stat:.4f}, P-value: {p_chi:.4f}")



Model: Logistic Regression
Accuracy: 0.0329
Precision: 0.0381
Recall: 0.0329
F1-score: 0.0349

Model: Random Forest
Accuracy: 0.1382
Precision: 0.1562
Recall: 0.1382
F1-score: 0.1423

Model: SVM
Accuracy: 0.0658
Precision: 0.0689
Recall: 0.0658
F1-score: 0.0628

Top 3 Important Features:
Student ID              0.093849
Study Hours Per Week    0.086018
Age                     0.070060
dtype: float64

Explained Variance Ratio: [0.99196641 0.00669523]

T-test Results:
T-statistic: 3.4489, P-value: 0.0006

Z-test Results:
Z-statistic: 1.1762, P-value: 0.2395

Chi-Square Test Results:
Chi-square statistic: 8.9103, P-value: 0.2592
