In [5]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns

import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

%matplotlib inline
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings("ignore")

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report, mean_squared_error, confusion_matrix

import tensorflow 
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import  Flatten, Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam, SGD, RMSprop

# from tensorflow.keras import utils
# from tensorflow.keras.utils import to_categorical

np.random.seed(45) # чтобы выборки данных всегда были одни и теже берем любую цифру . мне нравится 45!

In [6]:
data_main = pd.read_csv(r'C:\Users\grain\Work_folder\Diplom_MGTU\Diploma2_inspect\data_main.csv')

In [7]:
data_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pattern_angle                 948 non-null    float64
 1   step_strip                    948 non-null    float64
 2   density_strip                 948 non-null    float64
 3   ratio_filler_matrix           948 non-null    float64
 4   density                       948 non-null    float64
 5   elasticity_module             948 non-null    float64
 6   number_hardeners              948 non-null    float64
 7   content_epoxy_groups          948 non-null    float64
 8   flash_temperature             948 non-null    float64
 9   surface_density               948 non-null    float64
 10  elasticity_module_stretching  948 non-null    float64
 11  strapery_strength             948 non-null    float64
 12  resin_consumption             948 non-null    float64
dtypes: fl

In [11]:
# (RU) Функция оптимизации типов в dataframe.

# !!!! Код взят из источника https://github.com/ellavs/python-pandas-optimize-dataframe-memory-usage
# Function optimizes memory usage in dataframe.

def optimize_memory_usage(df, print_size=True):

# Types for optimization.
    # Типы, которые будем проверять на оптимизацию.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Memory usage size before optimize (Mb).
    # (RU) Размер занимаемой памяти до оптимизации (в Мб).
    before_size = df.memory_usage().sum() / 1024**2    
    for column in df.columns:
        column_type = df[column].dtypes
        if column_type in numerics:
            column_min = df[column].min()
            column_max = df[column].max()
            if str(column_type).startswith('int'):
                if column_min > np.iinfo(np.int32).min and column_max < np.iinfo(np.int32).max:
                    df[column] = df[column].astype(np.int32)
                elif column_min > np.iinfo(np.int64).min and column_max < np.iinfo(np.int64).max:
                    df[column] = df[column].astype(np.int64)  
            else:
                if column_min > np.finfo(np.float32).min and column_max < np.finfo(np.float32).max:
                    df[column] = df[column].astype(np.float32)
                else:
                    df[column] = df[column].astype(np.float64)    
    # Memory usage size after optimize (Mb).
    # (RU) Размер занимаемой памяти после оптимизации (в Мб).
    after_size = df.memory_usage().sum() / 1024**2
    if print_size: print('Memory usage size: before {:5.4f} Mb - after {:5.4f} Mb ({:.1f}%).'.format(before_size, after_size, 100 * (before_size - after_size) / before_size))
    return df

def import_data_from_csv(df):
    # Show dataframe info before optimize.
    # (RU) Показать информацию о таблице до оптимизации.
    print('-' * 80)
    print(df.info())
    print('-' * 80)
    # (RU) Оптимизация типов в dataframe.
    df = optimize_memory_usage(df) # используем функцию optimize_memory_usage , созданую выше
    # Show dataframe info after optimize.
    # (RU) Показать информацию о таблице после оптимизации.
    print('-' * 80)
    print(df.info())
    print('-' * 80)
    return df

In [12]:
# Вызываем функцию import_data_from_csv для уменьшения размерности переменных в файле df_bp_new

data_main32 = import_data_from_csv(data_main)

--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pattern_angle                 948 non-null    float64
 1   step_strip                    948 non-null    float64
 2   density_strip                 948 non-null    float64
 3   ratio_filler_matrix           948 non-null    float64
 4   density                       948 non-null    float64
 5   elasticity_module             948 non-null    float64
 6   number_hardeners              948 non-null    float64
 7   content_epoxy_groups          948 non-null    float64
 8   flash_temperature             948 non-null    float64
 9   surface_density               948 non-null    float64
 10  elasticity_module_stretching  948 non-null    float64
 11  strapery_strength             948 non-null

In [13]:
# Функция расчта нового признака ALFA
# df - Передаем в функцию dataframe
# data_main_add_col - возвращаем df- с новым добавленым столбцом ALFA,  
# а также удалим два столбца:  'strapery_strength' и 'elasticity_module_stretching'

def add_column_alfa(df):
    df_add_col = df.assign(alfa = df.strapery_strength /df.elasticity_module_stretching)
    # df_add_col =  df_add_col.drop(['strapery_strength', 'elasticity_module_stretching'], axis=1)
    return(df_add_col)

In [14]:
df_add_alfa = add_column_alfa(data_main32)

In [15]:
df_add_alfa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pattern_angle                 948 non-null    float32
 1   step_strip                    948 non-null    float32
 2   density_strip                 948 non-null    float32
 3   ratio_filler_matrix           948 non-null    float32
 4   density                       948 non-null    float32
 5   elasticity_module             948 non-null    float32
 6   number_hardeners              948 non-null    float32
 7   content_epoxy_groups          948 non-null    float32
 8   flash_temperature             948 non-null    float32
 9   surface_density               948 non-null    float32
 10  elasticity_module_stretching  948 non-null    float32
 11  strapery_strength             948 non-null    float32
 12  resin_consumption             948 non-null    float32
 13  alfa 

In [19]:
from sklearn import decomposition
from sklearn import linear_model
from sklearn.pipeline import Pipeline

Собираем датасет

In [20]:
y1 = df_add_alfa[["elasticity_module_stretching"]]
y2 = df_add_alfa[["strapery_strength"]]
X = df_add_alfa.drop(columns=["elasticity_module_stretching",
                     "strapery_strength"]
                    )

In [None]:
# Проведем дальнейшие действия для  целевой переменной "elasticity_module_stretching"
# Для целевой переменной "strapery_strength" действия будут аналогичными

In [None]:
# Разделяем датасет на тестовые, проверочные и обучающие выборки за 2 приема

In [None]:
# Разделяем датасет на тестовы (test) и 1-ю обучающие выборки
# Разбиваем X на X_train_full и X_test_full в пропорции 80/20, а y  на  y_train_full и y_test_full

In [29]:
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y1, 
                                                    train_size=0.8, 
                                                    random_state=45
                                                    )

# X_test_full и ytest_full будут использованы в конце при проверке нашей моддели
## НУЖНО из записать в папку!!!

In [30]:
X_test_full.shape

(190, 12)

In [31]:
y_test_full.shape

(190, 1)

In [32]:
X_train_full.shape

(758, 12)

In [33]:
y_train_full.shape

(758, 1)

In [None]:
# Разделяем датасет на проверочную (validation) и 2-ю обучающую выборки

In [34]:
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_train_full_1, y_train_full_1, 
                                                    train_size=0.8, 
                                                    random_state=45
                                                    )

In [None]:
# X_val_1 и y_val_1 будут использованы при проверке работоспособности нашей модели после обучения 

In [35]:
X_train_1.shape

(606, 12)

In [36]:
X_val_1.shape

(152, 12)

In [38]:
y_train_1.shape

(606, 1)

In [37]:
y_val_1.shape

(152, 1)

# Взято из
https://www.projectpro.io/recipes/create-and-optimize-baseline-linear-regression-model

In [39]:
# Creating an scaler object
std_scl = StandardScaler()

# Creating a pca object
pca = decomposition.PCA()

# Creating a linear regression object with an L2 penalty
linear = linear_model.LinearRegression()

# Creating a pipeline of three steps. First, standardize the data.
# Second, tranform the data with PCA.
# Third, train a Decision Tree Classifier on the data.
pipe = Pipeline(steps=[('std_scl', std_scl),
                       ('pca', pca),
                       ('linear', linear)])

# Creating Parameter Space
# Creating a list of a sequence of integers from 1 to 30 (the number of features in X + 1)
n_components = list(range(1, X_train_1.shape[1]+1,1))

# Creating lists of parameter for Linear Regression
normalize = [True, False]

# Creating a dictionary of all the parameter options 
# Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(pca__n_components=n_components,
                  linear__normalize=normalize)

# Conducting Parameter Optmization With Pipeline
# Creating a grid search object
clf = GridSearchCV(pipe, parameters)

# Fitting the grid search
clf.fit(X_train_1,y_train_1)

# Viewing The Best Parameters
print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])
print(); print(clf.best_estimator_.get_params()['linear'])

Best Number Of Components: 4

LinearRegression(normalize=True)


In [None]:
# Взято из 
https://python.hotexamples.com/ru/examples/sklearn.linear_model/LinearRegression/get_params/python-linearregression-get_params-method-examples.html

In [149]:
def simple_linear(X_train, y_train, X_test, y_test):
    linear = LinearRegression()
    linear.fit(X_train, y_train)
    y_pred = linear.predict(X_test)
    print('\nLinear Regression Summary:')
    print()
    print('R2:', linear.score(X_test, y_test))
    print()
    print('Intercept:', linear.intercept_)
    print('\nCoefficients:', linear.coef_)
    print('DataFrame:', pd.DataFrame(model.coef_, X_train.columns))
    print()
    print('Parameters:', linear.get_params())
    print()
    '''Predict how well model will perfom on test data'''
    score = cross_val_score(estimator=linear,
                            X=X_train,
                            y=y_train,
                            fit_params=None,
                            scoring='r2',
                            cv=3,
                            n_jobs=-1)
    print('Mean Cross Validation Score:', score.mean())

In [150]:
simple_linear(X_train, y_train, X_val, y_val)


Linear Regression Summary:

R2: -0.019389360805280864

Intercept: 0.53328943

Coefficients: [-0.06732082  0.01082223 -0.06388463]
DataFrame:                          0
density          -0.067321
number_hardeners  0.010822
surface_density  -0.063885

Parameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False}

Mean Cross Validation Score: -0.008429587943518224
