In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the raw dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv")

# Bare last expression: the notebook renders the frame below the cell.
df

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Summary statistics produced by DataFrame.describe() with its default settings.
df.describe()

In [None]:
# Count rows flagged as duplicates of an earlier row (all columns compared).
df.duplicated().sum()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Remove the "Unnamed: 32" column.
# NOTE(review): presumably an empty artifact column from the CSV export - confirm
# against the missing-value counts.
# Rebinding `df` with errors="ignore" (instead of an in-place drop) keeps this
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=["Unnamed: 32"], errors="ignore")

In [None]:
# Re-check the per-column missing-value counts at this point in the notebook.
df.isna().sum()

In [None]:
# Bar chart of how many rows carry each diagnosis label.
diagnosis_counts = df['diagnosis'].value_counts()
diagnosis_counts.plot.bar(color=['blue', 'orange'])


In [None]:
# Display the column index of the frame.
df.columns

In [None]:
# Columns whose number of distinct values is reported below.
columns_to_profile = [
    'id', 'diagnosis',
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]

# Distinct-value count per listed column.
df[columns_to_profile].nunique()

In [None]:
# Heatmap of the pairwise correlations between all numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(18, 14))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Pair plot over a handful of the *_mean measurements.
# NOTE(review): no `hue` is passed here, so 'diagnosis' is presumably not drawn
# if it is a non-numeric column - confirm.
pairplot_columns = [
    'radius_mean', 'texture_mean', 'perimeter_mean',
    'area_mean', 'smoothness_mean', 'diagnosis',
]
sns.pairplot(data=df[pairplot_columns])
plt.show()

In [None]:
# Measurements examined against the diagnosis label in this cell.
features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
]

# One box plot per feature on a 2x3 grid; subplot slots are numbered from 1.
plt.figure(figsize=(15, 8))
for slot, feature in enumerate(features, start=1):
    plt.subplot(2, 3, slot)
    sns.boxplot(data=df, x='diagnosis', y=feature)
    plt.title(f'{feature} vs diagnosis')
plt.tight_layout()
plt.show()

In [None]:
# One figure per feature: step histogram with a KDE overlay, coloured by diagnosis.
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x=feature, hue='diagnosis', kde=True, element="step", ax=ax)
    ax.set_title(f'Distribution of {feature} by diagnosis')
    plt.show()

In [None]:
# Pair plot of the selected features, coloured by the diagnosis label.
sns.pairplot(data=df[[*features, 'diagnosis']], hue='diagnosis')
plt.show()

In [None]:
# Box plot of every float64 column split by diagnosis, three panels per row.
float_cols = [col for col in df.columns if df[col].dtype == 'float64']

grid_cols = 3
# Ceiling division so a column count that is an exact multiple of `grid_cols`
# does not reserve an extra, empty row. max(1, ...) keeps the figure height
# positive when there are no float columns at all.
grid_rows = max(1, (len(float_cols) + grid_cols - 1) // grid_cols)

# Height scales with the number of rows, not the number of columns.
plt.figure(figsize=(18, 4 * grid_rows))
for i, col in enumerate(float_cols):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.boxplot(x='diagnosis', y=col, data=df)
    plt.title(f'{col} vs diagnosis')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot for every unordered pair of float64 columns, coloured by diagnosis.
# With k float columns this draws k*(k-1)/2 panels, so the figure can get very large.
float_cols = [col for col in df.columns if df[col].dtype == 'float64']
pairs = list(combinations(float_cols, 2))

n_cols = 5
# Ceiling division; at least one row so plt.subplots never receives a zero-row grid.
n_rows = max(1, (len(pairs) + n_cols - 1) // n_cols)

# squeeze=False always yields a 2-D array of Axes, so flattening is safe for any grid shape.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4), squeeze=False)
axes = axes.ravel()

for ax, (x_col, y_col) in zip(axes, pairs):
    sns.scatterplot(data=df, x=x_col, y=y_col, hue='diagnosis', palette='Set1', alpha=0.7, ax=ax)
    ax.set_title(f'{x_col} vs {y_col}')
    ax.legend().set_title('diagnosis')
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

# Hide the unused trailing panels. Slicing by len(pairs) avoids depending on a
# loop variable that is undefined (or left over from another cell) when `pairs` is empty.
for ax in axes[len(pairs):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Features: every column except the 'id' column and the label.
X = df[[col for col in df.columns if col not in ['id', 'diagnosis']]]

# Encode the label as 1 for 'M' and 0 for 'B'.
label_encoding = {'M': 1, 'B': 0}
y = df['diagnosis'].map(label_encoding)

# Series.map turns any label missing from the mapping into NaN; fail loudly here
# rather than letting the NaN reach the split or the classifier.
unmapped_labels = df.loc[y.isna(), 'diagnosis'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected diagnosis labels: {list(unmapped_labels)}")

# Hold out 20% for testing; stratify so both splits keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fixed random_state so the fitted tree is reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out split. The printed Arabic label means "accuracy".
y_pred = clf.predict(X_test)
print("الدقة:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Recompute the evaluation metrics from the held-out predictions.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Assemble the file content first, then write it in a single call.
# The Arabic label on the first line means "accuracy".
results_text = f"الدقة: {accuracy}\n" + report
with open("model_results.txt", "w", encoding="utf-8") as f:
    f.write(results_text)

# Print the name of the file that was just written.
print("model_results.txt")

In [None]:
import joblib

# Serialize the `clf` estimator to tree_model.pkl in the working directory.
joblib.dump(value=clf, filename="tree_model.pkl")

In [None]:
from importlib import metadata

# Installed *distribution* names to pin. Standard-library modules (itertools)
# and submodules (sklearn.metrics, sklearn.model_selection, sklearn.tree) are
# not distributions, so they never resolve to a version; scikit-learn already
# covers every sklearn.* import.
packages = [
    "streamlit",
    "pandas",
    "scikit-learn",
    "joblib",
    "altair",
    "numpy",
    "matplotlib",
    "seaborn",
]


def write_requirements(packages, path="requirements.txt"):
    """Write ``name==version`` lines for the installed distributions in `packages`.

    Args:
        packages: iterable of distribution names to look up.
        path: file to (over)write with the pinned requirements.

    Returns:
        List of names that were not installed and were therefore skipped.
    """
    missing = []
    with open(path, "w", encoding="utf-8") as f:
        for package in packages:
            try:
                version = metadata.version(package)
            except metadata.PackageNotFoundError:
                # Best effort: report the gap and keep going with the rest.
                print(f"[!] Package '{package}' not found. Skipping.")
                missing.append(package)
                continue
            f.write(f"{package}=={version}\n")
    return missing


# Assigned to a name so the notebook does not echo the return value as cell output.
missing_packages = write_requirements(packages)