In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
# Show at most 50 rows when a frame or series is displayed
pd.options.display.max_rows = 50

# Remove the cap on the number of displayed columns
pd.options.display.max_columns = None

# Remove the cap on the number of characters shown per cell value
pd.options.display.max_colwidth = None

In [3]:
# One-off export, intentionally left disabled: loads the seaborn "diamonds"
# sample dataset and writes it to csv/diamonds.csv as a tab-separated UTF-8
# file with a header row and without the index column.
# Uncomment both lines to regenerate that file.
# diamonds_df = sns.load_dataset("diamonds")
# diamonds_df.to_csv('csv/diamonds.csv', sep='\t', encoding='utf-8', index=False, header=True)

In [4]:
# Read the tab-separated diamonds file; its first line holds the column names
diamonds_df = pd.read_csv(
    'csv/diamonds.csv',
    sep='\t',
    header=0,
    decimal='.',
    encoding='utf-8',
)
diamonds_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [5]:
# Drop every column whose name contains the substring "id" and keep the rest
# (with the columns listed in this notebook nothing matches, so the frame is unchanged)
diamonds_df = diamonds_df.drop(
    diamonds_df.filter(like='id').columns,
    axis='columns',
)

In [6]:
# Column names and how many of them there are
print("columns: ", list(diamonds_df.columns), '\n')

print("count columns: ", len(diamonds_df.columns), '\n')

# Missing values per column. NOTE: despite the printed label, the value shown is
# the percentage of missing rows for each column that has at least one gap.
print("count empty columns: ")
sum_null_diamonds_series = diamonds_df.isna().sum()
print(
    sum_null_diamonds_series[sum_null_diamonds_series.gt(0)]
    .astype(float)
    .div(len(diamonds_df))
    .mul(100),
    '\n',
)

# Storage dtype of every column
print("Type columns:")
print(diamonds_df.dtypes, '\n')


# Summary statistics of the numeric columns
print("Describe:")
print(diamonds_df.describe())

columns:  ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z'] 

count columns:  10 

count empty columns: 
Series([], dtype: float64) 

Type columns:
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object 

Describe:
              carat         depth         table         price             x  \
count  53940.000000  53940.000000  53940.000000  53940.000000  53940.000000   
mean       0.797940     61.749405     57.457184   3932.799722      5.731157   
std        0.474011      1.432621      2.234491   3989.439738      1.121761   
min        0.200000     43.000000     43.000000    326.000000      0.000000   
25%        0.400000     61.000000     56.000000    950.000000      4.710000   
50%        0.700000     61.800000     57.000000   2401.000000      5.700000   
75%        1.040000     62.500000     59.000000   5324

In [9]:
# Names of the columns stored with the object dtype (the categorical text columns)
object_columns = list(diamonds_df.select_dtypes(include=[object]).columns)
print("object columns:", object_columns, '\n')
print("count object columns:", len(object_columns), '\n')

# Preview the categorical part of the frame
diamonds_objects_df = diamonds_df[object_columns]
print(diamonds_objects_df.head(), '\n')

# One-hot encode every categorical column into indicator columns
dummies_df = pd.get_dummies(diamonds_objects_df)

# Replace the original categorical columns with their indicator columns:
# the remaining columns come first, the indicators are appended on the right
diamonds_dummies_df = pd.concat(
    [diamonds_df.drop(columns=object_columns), dummies_df],
    axis=1,
)
diamonds_dummies_df

object columns: ['cut', 'color', 'clarity'] 

count object columns: 3 

       cut color clarity
0    Ideal     E     SI2
1  Premium     E     SI1
2     Good     E     VS1
3  Premium     I     VS2
4     Good     J     SI2 



Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False
1,0.21,59.8,61.0,326,3.89,3.84,2.31,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
2,0.23,56.9,65.0,327,4.05,4.07,2.31,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False
3,0.29,62.4,58.0,334,4.20,4.23,2.63,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
4,0.31,63.3,58.0,335,4.34,4.35,2.75,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,False,False,False,False,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False


In [10]:
# Names of the numeric columns ('number' is the string alias of np.number)
numeric_columns = list(diamonds_df.select_dtypes(include='number').columns)
print("numeric columns:", numeric_columns, '\n')
print("count numeric columns:", len(numeric_columns), '\n')

# Numeric-only view of the data used by the following cells
diamonds_numeric_df = diamonds_df[numeric_columns]
diamonds_numeric_df

numeric columns: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z'] 

count numeric columns: 7 



Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.20,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74


In [None]:
# Imputer that fills missing values with the column median
# ('mean' is the alternative strategy that was considered)
imputer = SimpleImputer(strategy='median')

# Fit on the numeric frame and fill its gaps. The transformed values are wrapped
# back into a DataFrame in a single step, re-attaching BOTH the column labels and
# the row index of the source frame, so the result stays aligned with
# diamonds_numeric_df even when that frame does not carry a default 0..n-1 index.
# All values come back as floats (see the price column in the output).
imputed_df = pd.DataFrame(
    imputer.fit_transform(diamonds_numeric_df),
    index=diamonds_numeric_df.index,
    columns=diamonds_numeric_df.columns,
)

# Display the imputed frame
imputed_df

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,62.4,58.0,334.0,4.20,4.23,2.63
4,0.31,63.3,58.0,335.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757.0,5.75,5.76,3.50
53936,0.72,63.1,55.0,2757.0,5.69,5.75,3.61
53937,0.70,62.8,60.0,2757.0,5.66,5.68,3.56
53938,0.86,61.0,58.0,2757.0,6.15,6.12,3.74


In [35]:
# Pairwise Pearson correlation between all numeric columns
diamonds_numeric_df.corr(method='pearson')

Unnamed: 0,carat,depth,table,price,x,y,z
carat,1.0,0.028224,0.181618,0.921591,0.975094,0.951722,0.953387
depth,0.028224,1.0,-0.295779,-0.010647,-0.025289,-0.029341,0.094924
table,0.181618,-0.295779,1.0,0.127134,0.195344,0.18376,0.150929
price,0.921591,-0.010647,0.127134,1.0,0.884435,0.865421,0.861249
x,0.975094,-0.025289,0.195344,0.884435,1.0,0.974701,0.970772
y,0.951722,-0.029341,0.18376,0.865421,0.974701,1.0,0.952006
z,0.953387,0.094924,0.150929,0.861249,0.970772,0.952006,1.0


In [38]:

# Work on a copy so the numeric frame itself is left untouched
diamonds_numeric_sq_df = diamonds_numeric_df.copy()

# For every numeric column append its square ("<name>_sq") and its cube ("<name>_tr")
for col_name in numeric_columns:
    source_values = diamonds_numeric_sq_df[col_name]
    diamonds_numeric_sq_df[f'{col_name}_sq'] = source_values.pow(2)
    diamonds_numeric_sq_df[f'{col_name}_tr'] = source_values.pow(3)

# Correlation matrix across the original columns and their powers
diamonds_numeric_sq_df.corr()

Unnamed: 0,carat,depth,table,price,x,y,z,carat_sq,carat_tr,depth_sq,depth_tr,table_sq,table_tr,price_sq,price_tr,x_sq,x_tr,y_sq,y_tr,z_sq,z_tr
carat,1.0,0.028224,0.181618,0.921591,0.975094,0.951722,0.953387,0.953474,0.825274,0.031107,0.033948,0.17977,0.177437,0.807126,0.704571,0.991181,0.996059,0.655279,0.147243,0.754579,0.204044
depth,0.028224,1.0,-0.295779,-0.010647,-0.025289,-0.029341,0.094924,0.025529,0.02518,0.999479,0.997924,-0.296933,-0.297268,-0.019473,-0.0222,-0.028565,-0.031862,-0.026839,-0.013139,0.074832,0.020587
table,0.181618,-0.295779,1.0,0.127134,0.195344,0.18376,0.150929,0.154086,0.119243,-0.28905,-0.281774,0.999237,0.996447,0.092152,0.07309,0.192851,0.187764,0.12058,0.023442,0.112777,0.025219
price,0.921591,-0.010647,0.127134,1.0,0.884435,0.865421,0.861249,0.890938,0.764552,-0.009758,-0.008855,0.125173,0.122877,0.944526,0.859469,0.908568,0.921358,0.601827,0.135579,0.688294,0.18783
x,0.975094,-0.025289,0.195344,0.884435,1.0,0.974701,0.970772,0.873879,0.712795,-0.022265,-0.019208,0.19357,0.191271,0.728086,0.61212,0.994531,0.97894,0.655854,0.143092,0.751198,0.198931
y,0.951722,-0.029341,0.18376,0.865421,0.974701,1.0,0.952006,0.852851,0.69539,-0.026591,-0.023804,0.18197,0.179682,0.712859,0.599352,0.969595,0.954443,0.800447,0.342544,0.739177,0.196887
z,0.953387,0.094924,0.150929,0.861249,0.970772,0.952006,1.0,0.853819,0.696581,0.097865,0.100677,0.148989,0.146637,0.706803,0.592724,0.964501,0.948381,0.652772,0.158974,0.860516,0.369027
carat_sq,0.953474,0.025529,0.154086,0.890938,0.873879,0.852851,0.853819,1.0,0.950823,0.028049,0.030541,0.152578,0.150676,0.850806,0.782209,0.915759,0.949296,0.606676,0.142461,0.697182,0.19467
carat_tr,0.825274,0.02518,0.119243,0.764552,0.712795,0.69539,0.696581,0.950823,1.0,0.027355,0.029507,0.118204,0.116875,0.783003,0.750836,0.768532,0.820313,0.509679,0.124235,0.586109,0.168976
depth_sq,0.031107,0.999479,-0.28905,-0.009758,-0.022265,-0.026591,0.097865,0.028049,0.027355,1.0,0.999479,-0.290039,-0.290214,-0.019104,-0.021968,-0.02564,-0.029062,-0.024986,-0.012653,0.077093,0.021153
