In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split # Для разделения данных на обучающую и тестовую выборку
from sklearn.feature_extraction.text import TfidfVectorizer # для преобразования текста в вектор
from sklearn.linear_model import LogisticRegression # использование модели логистической регрессии
from sklearn.metrics import accuracy_score, classification_report # оценка производительности модели
from sklearn.pipeline import Pipeline # конвеер обработки данных

In [4]:
df=pd.read_csv('cleaned_house_prices.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,SaleCondition,SalePrice,HouseStyle_Encoded,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,Age
0,1,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,...,Normal,208500,5,False,True,False,False,False,False,21
1,2,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,...,Normal,181500,2,False,True,False,False,False,False,48
2,3,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,...,Normal,223500,5,False,True,False,False,False,False,23
3,4,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,...,Abnorml,140000,5,False,True,False,False,False,False,109
4,5,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,...,Normal,250000,5,False,True,False,False,False,False,24


In [6]:
small_df = df.sample(frac = 0.01)
small_df.to_csv('small_set.csv', index=False)

In [7]:
big_df = df.drop(small_df.index)

In [12]:
big_df['SalePrice'].describe()

count      1412.000000
mean     174978.892351
std       66725.388986
min       34900.000000
25%      129500.000000
50%      160100.000000
75%      207700.000000
max      415298.000000
Name: SalePrice, dtype: float64

In [14]:
big_df.loc[big_df['SalePrice']>200000,'Expensiveness']='Expensive'
big_df.loc[big_df['SalePrice']<100000,'Expensiveness']='Chip'
big_df.loc[(big_df['SalePrice']>=100000) & (big_df['SalePrice']<=200000),'Expensiveness']='Medium'
big_df[['SalePrice','Expensiveness']]

Unnamed: 0,SalePrice,Expensiveness
0,208500,Expensive
1,181500,Medium
2,223500,Expensive
3,140000,Medium
4,250000,Expensive
...,...,...
1421,175000,Medium
1422,210000,Expensive
1423,266500,Expensive
1424,142125,Medium


In [22]:
small_df_marked=pd.read_csv('small_set_marked.csv')
small_df_marked = small_df_marked.rename(columns={'sentiment': 'Expensiveness'})
small_df_marked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 95 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   1stFlrSF            14 non-null     int64  
 1   2ndFlrSF            14 non-null     int64  
 2   3SsnPorch           14 non-null     int64  
 3   Age                 14 non-null     int64  
 4   Alley               14 non-null     object 
 5   BedroomAbvGr        14 non-null     int64  
 6   BldgType            14 non-null     object 
 7   BsmtCond            14 non-null     object 
 8   BsmtExposure        14 non-null     object 
 9   BsmtFinSF1          14 non-null     int64  
 10  BsmtFinSF2          14 non-null     int64  
 11  BsmtFinType1        14 non-null     object 
 12  BsmtFinType2        14 non-null     object 
 13  BsmtFullBath        14 non-null     int64  
 14  BsmtHalfBath        14 non-null     int64  
 15  BsmtQual            14 non-null     object 
 16  BsmtUnfSF 

In [21]:
big_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1412 entries, 0 to 1425
Data columns (total 89 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  1412 non-null   int64  
 1   MSSubClass          1412 non-null   int64  
 2   MSZoning            1412 non-null   object 
 3   LotFrontage         1412 non-null   float64
 4   LotArea             1412 non-null   int64  
 5   Street              1412 non-null   object 
 6   Alley               1412 non-null   object 
 7   LotShape            1412 non-null   object 
 8   LandContour         1412 non-null   object 
 9   Utilities           1412 non-null   object 
 10  LotConfig           1412 non-null   object 
 11  LandSlope           1412 non-null   object 
 12  Neighborhood        1412 non-null   object 
 13  Condition1          1412 non-null   object 
 14  Condition2          1412 non-null   object 
 15  BldgType            1412 non-null   object 
 16  HouseStyle 

In [25]:
full_df=pd.concat([small_df_marked, big_df], ignore_index=True)
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1426 entries, 0 to 1425
Data columns (total 95 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   1stFlrSF            1426 non-null   int64  
 1   2ndFlrSF            1426 non-null   int64  
 2   3SsnPorch           1426 non-null   int64  
 3   Age                 1426 non-null   int64  
 4   Alley               1426 non-null   object 
 5   BedroomAbvGr        1426 non-null   int64  
 6   BldgType            1426 non-null   object 
 7   BsmtCond            1426 non-null   object 
 8   BsmtExposure        1426 non-null   object 
 9   BsmtFinSF1          1426 non-null   int64  
 10  BsmtFinSF2          1426 non-null   int64  
 11  BsmtFinType1        1426 non-null   object 
 12  BsmtFinType2        1426 non-null   object 
 13  BsmtFullBath        1426 non-null   int64  
 14  BsmtHalfBath        1426 non-null   int64  
 15  BsmtQual            1426 non-null   object 
 16  BsmtUn

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
full_df['LotShape'], # текст рецензии - будет использоваться как входные данные
full_df['Expensiveness'], # метка классов (положительный или отрицательный )
test_size=0.2, # доля данных которая попадет в тест (20%)
random_state=42 # Зерно генератора случайных чисел для воспроизводимости результатов
)

In [31]:
pipeline = Pipeline([
('tfidf', TfidfVectorizer(stop_words='english')), # векторизация текста, с исключением стоп слов англ языка
('clf', LogisticRegression(max_iter=1000)) # модель логистической регресии с увелечинным колличеством итераций до 1000
])

In [32]:
#Обучение модели на обучающем наборе данных
pipeline.fit(X_train, y_train)

In [33]:
predictions = pipeline.predict(X_test) # предсказание модели на тестовых данных
accuracy = accuracy_score(y_test, predictions) # расчет точности модели
report = classification_report(y_test, predictions) # оценка качесвта предсказания

print(f"Точность модели {accuracy}")
print('Отчет по классификации')
print(report)

Точность модели 0.6853146853146853
Отчет по классификации
              precision    recall  f1-score   support

        Chip       0.00      0.00      0.00        21
   Expensive       0.00      0.00      0.00        69
      Medium       0.69      1.00      0.81       196

    accuracy                           0.69       286
   macro avg       0.23      0.33      0.27       286
weighted avg       0.47      0.69      0.56       286



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
