<a href="https://colab.research.google.com/github/Marjan-Salari/Detection-social-anxiety-disorder-in-X/blob/main/Untitled147.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# نصب کتابخانه‌های لازم
# ===============================
!pip install xgboost scikit-learn matplotlib seaborn gensim openpyxl --quiet

# ===============================
# ایمپورت کتابخانه‌ها
# ===============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils import resample
from gensim.models import Word2Vec
from google.colab import files
import os

# ===============================
# Step 1: upload the data files
# ===============================
def _upload_single_file(prompt):
    """Show ``prompt``, open the Colab upload dialog and return the upload.

    Args:
        prompt: Message printed before the upload dialog opens.

    Returns:
        A ``(uploaded, filename)`` tuple where ``uploaded`` is the mapping
        returned by ``files.upload()`` and ``filename`` is its first key.

    Raises:
        ValueError: If the dialog returned no file (for example because it
            was cancelled), instead of failing with a bare IndexError.
    """
    print(prompt)
    uploaded = files.upload()
    if not uploaded:
        raise ValueError("No file was uploaded")
    # Only the first uploaded file is used, as before.
    return uploaded, next(iter(uploaded))


train_file, train_filename = _upload_single_file("لطفا فایل آموزش (train) را انتخاب کنید:")

val_file, val_filename = _upload_single_file("لطفا فایل اعتبارسنجی (validation) را انتخاب کنید:")

test_file, test_filename = _upload_single_file("لطفا فایل تست (test) را انتخاب کنید:")

def load_file(filename):
    """Load a tabular data file into a DataFrame based on its extension.

    The extension check is case-insensitive, so names such as ``DATA.CSV``
    or ``Sheet.XLSX`` are accepted as well.

    Args:
        filename: Path of a ``.csv``, ``.xlsx`` or ``.xls`` file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` (CSV) or ``pd.read_excel``
        (Excel).

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    # Compare on a lower-cased copy; the original name is still what gets opened.
    normalized = filename.lower()
    if normalized.endswith('.csv'):
        return pd.read_csv(filename)
    if normalized.endswith(('.xlsx', '.xls')):
        return pd.read_excel(filename)
    raise ValueError("File must be CSV or Excel")

# Read the uploaded files into DataFrames, in train / validation / test order.
train_df, val_df, test_df = [
    load_file(path)
    for path in (train_filename, val_filename, test_filename)
]

# ===============================
# Step 2: balance the classes in the training data
# ===============================
def balance_data(df, target='label'):
    """Downsample over-represented classes when ``df`` is imbalanced.

    The data counts as imbalanced when the gap between the largest and the
    smallest class exceeds 10% of the largest class. In that case every class
    larger than the smallest one is downsampled (without replacement) to the
    smallest class size, and the combined rows are shuffled. Which label is
    the majority is read from the data rather than assumed to be ``0``.

    Args:
        df: DataFrame holding the samples.
        target: Name of the label column.

    Returns:
        ``df`` itself when it has fewer than two classes or is already
        balanced; otherwise a new, shuffled, balanced DataFrame. Rows whose
        target is missing are not part of any class and are therefore not
        included in the balanced result.
    """
    class_counts = df[target].value_counts()
    # With fewer than two classes there is nothing to balance against.
    if len(class_counts) < 2:
        return df

    smallest = int(class_counts.min())
    largest = int(class_counts.max())
    if (largest - smallest) / largest <= 0.1:
        return df

    parts = []
    # value_counts() is ordered from the most to the least frequent class.
    for class_label, count in class_counts.items():
        subset = df[df[target] == class_label]
        if count > smallest:
            subset = resample(subset,
                              replace=False,
                              n_samples=smallest,
                              random_state=42)
        parts.append(subset)
    df_balanced = pd.concat(parts)
    return df_balanced.sample(frac=1, random_state=42)

# Only the training split is rebalanced; val_df and test_df are left untouched.
train_df = balance_data(train_df)

# ===============================
# Step 3: prepare the data while keeping the tokens
# ===============================
X_train, y_train = train_df['text'], train_df['label']

X_val, y_val = val_df['text'], val_df['label']

X_test = test_df['text']
# DataFrame.get gives None when the test file has no 'label' column.
y_test = test_df.get('label')

# The vectorizers need plain strings, so token lists are temporarily joined.
def tokens_to_string(tokens):
    """Convert one document into a single whitespace-separated string.

    Args:
        tokens: A list or tuple of tokens, an already-joined string, or any
            other value found in the text column.

    Returns:
        The tokens joined by single spaces (each token converted with
        ``str``), an empty string for a missing value (``None`` or NaN), or
        ``str(tokens)`` for anything else.
    """
    if isinstance(tokens, (list, tuple)):
        # str() per token so numeric or mixed tokens do not break the join.
        return " ".join(str(token) for token in tokens)
    # NaN is the only float that differs from itself; a missing document
    # should be empty rather than the literal word "nan" / "None".
    if tokens is None or (isinstance(tokens, float) and tokens != tokens):
        return ""
    return str(tokens)

# Join the tokens of every document, for all three splits.
X_train_temp, X_val_temp, X_test_temp = (
    split.apply(tokens_to_string) for split in (X_train, X_val, X_test)
)

# ===============================
# Step 4: build the feature sets
# ===============================
def _vectorize_splits(vectorizer):
    """Fit ``vectorizer`` on the training text and transform all three splits.

    Returns the (train, validation, test) matrices in that order.
    """
    train_matrix = vectorizer.fit_transform(X_train_temp)
    val_matrix = vectorizer.transform(X_val_temp)
    test_matrix = vectorizer.transform(X_test_temp)
    return train_matrix, val_matrix, test_matrix


# TF-IDF over unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf, X_val_tfidf, X_test_tfidf = _vectorize_splits(tfidf_vectorizer)

# N-gram counts (CountVectorizer) over unigrams and bigrams
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_ngram, X_val_ngram, X_test_ngram = _vectorize_splits(ngram_vectorizer)

# Skip-gram features: each document is the mean of its Word2Vec word vectors.
def _to_sentences(texts):
    """Return one token list per document in ``texts``.

    Lists are used as they are, missing values (``None``/NaN) become empty
    documents, and anything else is converted with ``str`` and split on
    whitespace.
    """
    sentences = []
    for text in texts:
        if isinstance(text, list):
            sentences.append(text)
        elif text is None or (isinstance(text, float) and text != text):
            sentences.append([])
        else:
            sentences.append(str(text).split())
    return sentences


def _train_w2v_model(texts, size=100, window=5, min_count=1):
    """Train and return a skip-gram Word2Vec model on ``texts``."""
    # sg=1 selects skip-gram; gensim's default (sg=0) trains CBOW instead.
    return Word2Vec(_to_sentences(texts), vector_size=size, window=window,
                    min_count=min_count, workers=4, sg=1)


def get_w2v_embeddings(texts, size=100, window=5, min_count=1, model=None):
    """Embed every document as the mean of its word vectors.

    Args:
        texts: Iterable of documents (token lists or raw strings).
        size: Vector dimensionality used when a new model is trained.
        window: Context window used when a new model is trained.
        min_count: Minimum word frequency used when a new model is trained.
        model: Optional already-trained Word2Vec model. When given, it is
            used for the lookup and ``size``/``window``/``min_count`` are
            ignored; when omitted, a new model is trained on ``texts``.

    Returns:
        A float32 array of shape ``(number of documents, vector size)``.
        Documents without any known word get an all-zero row.
    """
    if model is None:
        model = _train_w2v_model(texts, size=size, window=window,
                                 min_count=min_count)
    word_vectors = model.wv
    sentences = _to_sentences(texts)
    # Take the width from the model so a reused model cannot disagree with `size`.
    embeddings = np.zeros((len(sentences), word_vectors.vector_size),
                          dtype=np.float32)
    for row, sent in enumerate(sentences):
        vecs = [word_vectors[word] for word in sent if word in word_vectors]
        if vecs:
            embeddings[row] = np.mean(vecs, axis=0)
    return embeddings


# Train the embedding once on the training split and reuse it for validation
# and test, so that all three matrices live in the same vector space.
w2v_model = _train_w2v_model(X_train)
X_train_w2v = get_w2v_embeddings(X_train, model=w2v_model)
X_val_w2v = get_w2v_embeddings(X_val, model=w2v_model)
X_test_w2v = get_w2v_embeddings(X_test, model=w2v_model)

# ===============================
# Step 5: define the models
# ===============================
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    # use_label_encoder is intentionally not passed: the option is deprecated
    # in XGBoost and recent releases only warn that it is unused.
    'XGBoost': XGBClassifier(eval_metric='logloss')
}

# Each feature set maps to its (train, validation, test) matrices.
features = {
    'TF-IDF': (X_train_tfidf, X_val_tfidf, X_test_tfidf),
    'N-gram': (X_train_ngram, X_val_ngram, X_test_ngram),
    'Skip-gram': (X_train_w2v, X_val_w2v, X_test_w2v)
}

# One dict of scores per (model, feature set) pair is collected here.
results = []

# Folder for the saved figures
os.makedirs("plots", exist_ok=True)

# ===============================
# Step 6: train and evaluate
# ===============================
def _validation_scores(y_true, y_pred):
    """Return accuracy, precision, recall, F1 and confusion matrix, in that order."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
    )


def _save_confusion_heatmap(matrix, model_label, feature_label):
    """Draw ``matrix`` as an annotated heatmap and save it under ``plots/``."""
    plt.figure(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues")
    plt.title(f'Confusion Matrix: {model_label} | {feature_label}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f'plots/ConfusionMatrix_{model_label}_{feature_label}.png')
    plt.close()


for feat_name, (X_tr, X_vl, X_te) in features.items():
    for model_name, model in models.items():
        print(f"\n=== Model: {model_name} | Feature: {feat_name} ===")

        # Fit on the training matrix, score on the validation matrix.
        model.fit(X_tr, y_train)
        y_pred_val = model.predict(X_vl)
        acc, prec, rec, f1, cm = _validation_scores(y_val, y_pred_val)

        scores = {'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1-score': f1}
        print("\n".join(f"{score_name}: {score:.4f}"
                        for score_name, score in scores.items()))
        print("Confusion Matrix:")
        print(cm)

        # Keep the scores for the summary tables and charts.
        results.append({'Model': model_name, 'Feature': feat_name, **scores})

        # Save the confusion-matrix figure.
        _save_confusion_heatmap(cm, model_name, feat_name)

# ===============================
# Step 7: save the results and charts
# ===============================
results_df = pd.DataFrame(results)
results_df.to_csv("model_results.csv", index=False)
results_df.to_excel("model_results.xlsx", index=False)

# Plain-text copy of the results, one line per model/feature pair.
# The encoding is explicit so the file does not depend on the platform default.
with open("model_results.txt", "w", encoding="utf-8") as f:
    for r in results:
        f.write(f"Model: {r['Model']} | Feature: {r['Feature']} | Accuracy: {r['Accuracy']:.4f} | Precision: {r['Precision']:.4f} | Recall: {r['Recall']:.4f} | F1-score: {r['F1-score']:.4f}\n")

# Line chart: one line per (metric, feature set) pair across the models.
plt.figure(figsize=(12,6))
for metric in ['Accuracy', 'Precision', 'Recall', 'F1-score']:
    for feat_name in features.keys():
        subset = results_df[results_df['Feature']==feat_name]
        plt.plot(subset['Model'], subset[metric], marker='o', label=f"{metric}-{feat_name}")
plt.title("Model Comparison Line Chart")
plt.xlabel("Model")
plt.ylabel("Score")
plt.legend()
plt.grid(True)
plt.savefig("plots/Model_Comparison_Line.png")
plt.close()

# Bar chart: one panel per metric. A single barplot over the melted table
# would average the four different metrics into each bar, so the 'Metric'
# column is used to split the chart into panels instead.
metrics_plot = results_df.melt(id_vars=['Model','Feature'], value_vars=['Accuracy','Precision','Recall','F1-score'], var_name='Metric', value_name='Score')
bar_grid = sns.catplot(data=metrics_plot, x='Model', y='Score', hue='Feature',
                       col='Metric', col_wrap=2, kind='bar',
                       height=4, aspect=1.5)
bar_grid.set_axis_labels("Model", "Score")
# y > 1 places the overall title above the panel titles.
bar_grid.figure.suptitle("Model Comparison Bar Chart", y=1.02)
bar_grid.savefig("plots/Model_Comparison_Bar.png")
plt.close(bar_grid.figure)

# ===============================
# Step 8: download all output files
# ===============================
for output_path in (
    "model_results.csv",
    "model_results.xlsx",
    "model_results.txt",
    "plots/Model_Comparison_Line.png",
    "plots/Model_Comparison_Bar.png",
):
    files.download(output_path)

# Download the confusion-matrix images
confusion_plots = [name for name in os.listdir("plots") if "ConfusionMatrix" in name]
for plot_name in confusion_plots:
    files.download(os.path.join("plots", plot_name))

print("All results and plots are saved and ready for download.")
