In [None]:
import numpy as np
import pandas as pd

In [None]:
import MySQLdb

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
warnings.simplefilter("ignore")

### Input Data

In [None]:
def get_data(filename, config_path="D:/github_projects/login.txt"):
    """Load a whole MySQL table into a DataFrame indexed by its ``id`` column.

    Parameters
    ----------
    filename : str
        Name of the table to read (it is interpolated into
        ``SELECT * FROM ...``). Only letters, digits, ``_`` and ``$`` are
        accepted.
    config_path : str, optional
        Text file holding the comma-separated positional arguments for
        ``MySQLdb.connect``. Defaults to the location used so far; pass
        another path when sharing the notebook.

    Returns
    -------
    pandas.DataFrame
        Every row and column of the table, with ``id`` as the index.

    Raises
    ------
    ValueError
        If ``filename`` is not a plain table identifier.
    """
    # Identifiers cannot be bound as query parameters, so the name is
    # validated before it is placed into the SQL text.
    if not isinstance(filename, str) or not filename.replace("_", "").replace("$", "").isalnum():
        raise ValueError(f"Invalid table name: {filename!r}")

    with open(config_path) as file:
        # strip() keeps the file's trailing newline out of the last argument.
        myserver_config = file.read().strip().split(",")  # config data for SQL server

    connect = MySQLdb.connect(*myserver_config)
    try:
        query = f"SELECT * FROM `{filename}`"
        data = pd.read_sql(query, connect, index_col="id")
    finally:
        # Release the connection even when the query fails.
        connect.close()
    return data

In [None]:
def change_types(mydata):
    """Convert the raw table's text columns to usable dtypes.

    The string ``"None"`` becomes NaN, the decimal-comma columns are parsed
    as ``float32``, ``Lazer`` is cast to ``object``, ``Thickness`` is dropped,
    and only rows with ``Density`` below 100 are returned. The frame passed
    in is not modified.
    """
    frame = mydata.replace("None", np.nan)

    # Decimal commas -> decimal points, then parse as 32-bit floats.
    for column in ("Density", "Х", "У", "Scanning_spot"):
        frame[column] = frame[column].str.replace(",", ".").astype(np.float32)

    frame["Lazer"] = frame["Lazer"].astype(np.object_)

    # Thickness is discarded (original note: removed because it varies).
    frame = frame.drop("Thickness", axis=1)

    below_limit = frame["Density"] < 100
    return frame[below_limit]


In [None]:
# Load every row of the `titanium_pure` table into a DataFrame indexed by `id`.
data = get_data("titanium_pure")

In [None]:
# NOTE(review): `data` is rebound to the converted frame, so this cell is not
# idempotent — re-running it without re-running the load cell will likely fail
# because the columns are no longer the raw text columns.
data = change_types(data)

### Data Preprocessing

Main data description

In [None]:
class DataPreprocessing:
    """Review plots and basic preprocessing steps for a single DataFrame.

    ``numerical_cols`` (int/float dtypes) and ``categorical_cols`` (object
    dtype) are captured once, at construction. The target column is expected
    to be called ``Density``; it is excluded from ``X`` and from scaling.
    """

    def __init__(self, df):
        # Private copy: in-place steps (e.g. Standartization) must not
        # modify the frame owned by the caller.
        self.__data = df.copy()
        self.numerical_cols = df.select_dtypes(["int", "float"]).columns
        self.categorical_cols = df.select_dtypes(["object"]).columns

    def Main_info(self):
        """Print the first rows, the ``info()`` summary and ``describe()``."""
        print(self.__data.head())
        # info() prints by itself and returns None; wrapping it in print()
        # would add a stray "None" line.
        self.__data.info()
        print(self.__data.describe())

    def Numerical_review(self):
        """Show a histogram with KDE for every numerical column."""
        print(*self.numerical_cols, sep=", ")
        for col in self.numerical_cols:
            sns.histplot(self.__data[col], kde=True)
            plt.title(f"Distribution of {col}")
            plt.show()

    def Emmisions_review(self):
        """Show a boxplot for every numerical column (outlier review)."""
        print(*self.numerical_cols, sep=", ")
        for col in self.numerical_cols:
            # Keyword form: the meaning of the first positional argument
            # differs between seaborn releases.
            sns.boxplot(x=self.__data[col])
            plt.title(f"Boxplot of {col}")
            plt.show()

    def Remove_emmisions(self, boarders=3):
        """Drop rows lying ``boarders`` standard deviations or more from the mean.

        Columns are processed one after another, so the mean and standard
        deviation of a later column are computed on the rows that survived
        the earlier ones. Rows with NaN in a processed column are dropped as
        well (the comparison is False for NaN). Columns with zero or
        undefined spread are skipped. The boxplots are redrawn afterwards.

        Returns
        -------
        pandas.DataFrame
            The filtered frame (also kept as the new internal state).
        """
        for col in self.numerical_cols:
            values = self.__data[col]
            center = values.mean()
            standart_deviasion = values.std()
            # A constant (or all-NaN / single-row) column has no outliers;
            # filtering on it would discard every row.
            if not standart_deviasion > 0:
                continue
            # Distance is measured from the column mean, so the filter is
            # also correct for columns that are not centred on zero.
            within = (values - center).abs() < boarders * standart_deviasion
            self.__data = self.__data[within]

        self.Emmisions_review()
        return self.__data

    def Correlation_review(self):
        """Show a heatmap of the correlations between int/float columns."""
        corr_matrix = self.__data.select_dtypes(["int", "float"]).corr()
        sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
        plt.title("Correlation matrix")
        plt.show()

    def Categorical_review(self):
        """Show a countplot for every categorical column still in the data."""
        # After One_hot_encoding the original categorical columns are gone.
        present = [col for col in self.categorical_cols if col in self.__data.columns]
        print(*present, sep=", ")
        for col in present:
            sns.countplot(x=col, data=self.__data)
            plt.title(f"Countplot of {col}")
            plt.show()

    def One_hot_encoding(self):
        """Replace the categorical columns with dummy columns (first level dropped).

        Safe to call repeatedly: columns that were already encoded are skipped.

        Returns
        -------
        pandas.DataFrame
            The encoded frame (also kept as the new internal state).
        """
        remaining = [col for col in self.categorical_cols if col in self.__data.columns]
        if remaining:
            self.__data = pd.get_dummies(data=self.__data, columns=remaining, drop_first=True)
        return self.__data

    def Standartization(self):
        """Scale the numerical feature columns with ``StandardScaler``.

        ``Density`` is never scaled; the method also works when the frame
        has no ``Density`` column.

        Returns
        -------
        pandas.DataFrame
            The feature frame (``X``) after scaling.
        """
        columns = [
            col for col in self.numerical_cols
            if col != "Density" and col in self.__data.columns
        ]
        if columns:
            scalar = StandardScaler()
            self.__data[columns] = scalar.fit_transform(self.__data[columns])
        return self.X

    @property
    def X(self):
        """Feature frame: the data without the ``Density`` column."""
        name = "Density"
        if name in self.__data.columns:
            return self.__data.drop(name, axis=1)
        return self.__data

    @property
    def y(self):
        """Target series ``Density``; raises ``KeyError`` when it is absent."""
        return self.__data["Density"]

In [None]:
# Wrap the converted frame; the numerical/categorical column lists are captured here.
instance = DataPreprocessing(data)

### Main data info

In [None]:
# Print head(), info() and describe() of the wrapped frame.
instance.Main_info()

### Numerical review

In [None]:
# One histogram with KDE per numerical column.
instance.Numerical_review()

### Outliers in Data

In [None]:
# One boxplot per numerical column, to spot outliers.
instance.Emmisions_review()

### Correlation review

In [None]:
# Annotated heatmap of the correlations between the int/float columns.
instance.Correlation_review()

### Categorical data review

In [None]:
# One countplot per object-dtype column.
instance.Categorical_review()

### One Hot Encoding for Categorical fields

In [None]:
# Replace the object-dtype columns with dummy columns (first level dropped).
# The returned frame is the cell output, so the whole frame is rendered.
instance.One_hot_encoding()

### Standardization

In [None]:
# Scale every numerical column except Density with StandardScaler;
# the cell output is the feature frame (Density excluded).
instance.Standartization()

### Outlier removal on the feature matrix

In [None]:
# Second wrapper around the feature frame only (Density already dropped),
# so `X.y` is not available on this object — the target stays in `instance.y`.
X = DataPreprocessing(instance.X)

In [None]:
# Boxplots of the scaled feature columns before outlier removal.
X.Emmisions_review()

In [None]:
# Drop rows beyond the default 3 standard deviations in any numerical column,
# then redraw the boxplots.
# NOTE(review): only the features are filtered here; `instance.y` keeps all
# rows, so align the target by index before modelling.
X.Remove_emmisions()

### PCA analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
class PCA_analysis:
    """Explained-variance inspection and component selection with PCA.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame to analyse. ``resolve_optimal_components`` uses its
        column names to label the component loadings.
    """

    def __init__(self, data):
        self.__data = data

    def _max_components(self):
        """Largest component count PCA accepts: min(n_samples, n_features)."""
        n_samples, n_features = self.__data.shape
        return min(n_samples, n_features)

    def var_visualisation(self, var_threshold=0.9):
        """Plot the cumulative explained-variance ratio per component count.

        A horizontal red line marks ``var_threshold``.
        """
        # Sized from the stored frame, not from a notebook-level variable.
        max_principal = self._max_components()
        principal = PCA(n_components=max_principal).fit(self.__data)

        # Cumulative ratio for 1..max_principal components (last one included).
        y = np.cumsum(principal.explained_variance_ratio_)
        x = list(range(1, max_principal + 1))

        sns.barplot(x=x, y=y)
        plt.axhline(y=var_threshold, color="r", label=f"threshold = {var_threshold}")
        plt.title("Explained Variance Ratio")
        plt.xlabel("Nums of components")
        plt.ylabel("Ratio")
        plt.legend()
        plt.grid()
        plt.show()

    def resolve_optimal_components(self, var_threshold=0.9):
        """Fit PCA with the fewest components reaching ``var_threshold``.

        Prints the component loadings and returns the fitted model's
        ``transform`` method. When the threshold is never reached (rounding,
        or a threshold above 1) every available component is kept.

        Returns
        -------
        callable
            ``transform`` of the fitted PCA; call it on the feature frame to
            obtain the projected array.
        """
        max_principal = self._max_components()

        # One full fit gives the ratios of all leading components at once
        # (for the exact SVD solvers they do not depend on n_components).
        full_fit = PCA(n_components=max_principal).fit(self.__data)
        cumulative = np.cumsum(full_fit.explained_variance_ratio_)
        reached = np.flatnonzero(cumulative >= var_threshold)
        n_components = int(reached[0]) + 1 if reached.size else max_principal

        pca = PCA(n_components=n_components).fit(self.__data)

        components_df = pd.DataFrame(
            pca.components_,
            columns=self.__data.columns,
            index=[f"PC{k}" for k in range(1, n_components + 1)],
        )
        print("PCA:\n", components_df)

        return pca.transform
    
            
        

In [None]:
# NOTE(review): `instance` is rebound from the DataPreprocessing object to a
# PCA_analysis object; the preprocessing wrapper (and its `y`) is no longer
# reachable through this name after this cell.
instance = PCA_analysis(X.X)

In [None]:
# Cumulative explained-variance ratio per number of components, with the default 0.9 threshold marked.
instance.var_visualisation()

In [None]:
# Fit PCA with the fewest components reaching the default 0.9 threshold and
# project the features with the returned `transform`.
# NOTE(review): `X` is rebound from the DataPreprocessing wrapper to the
# transform output, so this cell cannot be re-run on its own (`X.X` is gone).
X = instance.resolve_optimal_components()(X.X)

In [None]:
# Display the PCA-projected feature matrix.
X

### Machine Learning