In [116]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

file_path = 'SanPhamMeVaBe.csv'
data = pd.read_csv(file_path)

# rename columns
data_cleaned = data.rename(columns={'Product Name': 'ProductName', 'Product Price': 'ProductPrice', 'Catergory': 'Category'})

# split data (80% train, 20% test)
X = data_cleaned[['ProductName', 'ProductPrice']]
y = data_cleaned['Category']

X_train, X_test, y_train, y_test = train_test_split(X['ProductName'], y, test_size=0.2, random_state=42)

# create pipeline
model = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_estimators=100, random_state=42))

# train model
model.fit(X_train, y_train)

# test model
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


KeyboardInterrupt: 

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
file_path = 'SanPhamMeVaBe.csv'
data = pd.read_csv(file_path)

In [None]:
# rename columns
data_cleaned = data.rename(columns={'Product Name': 'ProductName', 'Product Price': 'ProductPrice', 'Catergory': 'Category'})

In [None]:
# split data (80% train, 20% test)
X = data_cleaned['ProductName'].astype(str)
y = data_cleaned['Category']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# tokenization and padding
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=50, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=50, padding='post')

In [None]:
# create model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(50000, 64),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])
model.build(input_shape=(None, 50))
model.summary()

In [None]:
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# complie model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train model
model.fit(X_train_padded, y_train, epochs=50, validation_data=(X_test_padded, y_test))

In [None]:

# evaluate
loss, accuracy = model.evaluate(X_test_padded, y_test)

print(f"Test Accuracy: {accuracy:.4f}")

In [117]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [118]:
data = pd.read_csv('DanCu.csv')

# change data type
data['Diện tích(km2)'] = data['Diện tích(km2)'].str.replace(',', '.').astype(float)
data['Mật độ dân số (Người/km2)'] = data['Mật độ dân số (Người/km2)'].str.replace(',', '.').astype(float)

# choose features and target
features = data[['Năm', 'Diện tích(km2)', 'Tổng_số_cặp_kết_hôn', 'Mật độ dân số (Người/km2)']]
target = data['Tỷ_lệ_sinh']

In [119]:
# split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [120]:
# create model
model = LinearRegression()

# train model
model.fit(X_train, y_train)

In [121]:
# test model
y_pred = model.predict(X_test)

# evaluate
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.009157418126681083
R^2 Score: -0.34174624566755996


In [122]:
from sklearn.ensemble import RandomForestRegressor

# new model
model = RandomForestRegressor(random_state=42)

# train model
model.fit(X_train, y_train)

# test and evaluate again
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.009598460000000166
R^2 Score: -0.40636776556779197


In [None]:
from sklearn.model_selection import GridSearchCV

# new model
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=64, n_jobs= 5, verbose=5, n_estimators=100), param_grid, cv=3)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# evaluate again part 2
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Optimized Mean Squared Error: {mse}')
print(f'Optimized R^2 Score: {r2}')

In [123]:
from xgboost import XGBRegressor

model = XGBRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.0023691224774431037
R^2 Score: 0.6528758274808635


In [124]:
from sklearn.model_selection import RandomizedSearchCV

# new model
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    # 'learning_rate': [0.01, 0.1, 0.2, 0.02]
}

random_search = RandomizedSearchCV(XGBRegressor(random_state=42), param_distributions=param_dist, n_iter=15, cv=5, verbose=10)
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_

# evaluate again part 3
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Optimized Mean Squared Error: {mse}')
print(f'Optimized R^2 Score: {r2}')

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5; 1/15] START max_depth=30, n_estimators=100.............................
[CV 1/5; 1/15] END max_depth=30, n_estimators=100;, score=0.398 total time=   0.0s
[CV 2/5; 1/15] START max_depth=30, n_estimators=100.............................
[CV 2/5; 1/15] END max_depth=30, n_estimators=100;, score=0.503 total time=   0.0s
[CV 3/5; 1/15] START max_depth=30, n_estimators=100.............................
[CV 3/5; 1/15] END max_depth=30, n_estimators=100;, score=-3.312 total time=   0.0s
[CV 4/5; 1/15] START max_depth=30, n_estimators=100.............................
[CV 4/5; 1/15] END max_depth=30, n_estimators=100;, score=-22.318 total time=   0.0s
[CV 5/5; 1/15] START max_depth=30, n_estimators=100.............................
[CV 5/5; 1/15] END max_depth=30, n_estimators=100;, score=-0.307 total time=   0.0s
[CV 1/5; 2/15] START max_depth=10, n_estimators=150.............................
[CV 1/5; 2/15] END max_depth=10, n

In [125]:
import pandas as pd

data = pd.read_csv("place_info_final.csv")

In [126]:
# Define function to calculate weight
def calculate_weight(row):
    # Assign weights for each factor
    rating_weight = 0.2
    location_weight = 0.25
    hours_weight = 0.1
    chain_weight = 0.1
    district_weight = 0.1
    
    # Calculate score based on rating   
    rating_score = (row['rating'] / 5) * rating_weight
    
    # Assuming location and district scores are optimal for simplicity
    location_score = location_weight
    district_score = district_weight
    
    # Calculate score based on opening hours (assuming 14 hours is the max)
    opening_hours = 14  # Example: 8 AM to 10 PM
    hours_score = (opening_hours / 14) * hours_weight
    
    # Assuming chain score is optimal
    chain_score = chain_weight
    
    # Total score
    total_score = rating_score + location_score + hours_score + chain_score + district_score
    return total_score

In [127]:
# Apply function to calculate weight
data['weight'] = data.apply(calculate_weight, axis=1)

# sort by weight
data[['name', 'weight']]

Unnamed: 0,name,weight
0,Violet Pham,0.722
1,OVmart - Lê Trọng Tấn,0.726
2,Mẹ và Bé,0.750
3,Baby Kid,0.670
4,Nana's House - Siêu thị Mẹ và Bé,0.706
...,...,...
139,KidsPlaza Store Support Office,0.710
140,KidsPlaza - Combo đồ sơ sinh mùa hè bé trai bé...,0.746
141,KidsPlaza Kiot 4 - 6 tầng 1 HH03A Khu ĐT Thanh...,0.750
142,KidsPlaza,0.750


In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import numpy as np
import random

In [129]:
data = pd.read_csv("SanPhamMeVaBe.csv")

In [130]:
# example of creating a column
# create a column with random values
data['Sales'] = [random.randint(50, 1500) for _ in range(len(data))]

In [131]:
# split data (80% train, 20% test)
X = data[['Product Price', 'Age Usage', 'Country', 'Trademarks']]
y = data['Sales']
product_names = data['Product Name']

In [138]:
# create encryption pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Age Usage', 'Country', 'Trademarks'])
    ], remainder='passthrough', verbose_feature_names_out=False)

In [None]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LinearRegression())], verbose=True) # DON'T

In [147]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(n_estimators=100, verbose=10, n_jobs=5))], verbose=True)

In [148]:
# train test split
X_train, X_test, y_train, y_test, product_train, product_test = train_test_split(
    X, y, product_names, test_size=0.2, random_state=64
)

In [149]:
# train model
pipeline.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.0s
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100


[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    0.1s


building tree 6 of 100building tree 7 of 100

building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    0.3s


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100


[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    0.5s


building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100


[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    0.9s


building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100


[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    1.3s


building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    1.5s


building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100


[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:    2.0s


building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100


[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:    2.5s


building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100


[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:    3.0s


building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100


[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:    3.5s


building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
[Pipeline] ............. (step 2 of 2) Processing model, total=   4.1s


[Parallel(n_jobs=5)]: Done 100 out of 100 | elapsed:    4.0s finished
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [150]:
# test model
predictions = pipeline.predict(X_test)
predictions_rounded = np.round(predictions)

# show predictions
result_df = pd.DataFrame({
    'Sản phẩm': product_test,
    'Số lượng cần nhập': predictions_rounded.astype(int)
})

# In bảng kết quả
result_df

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,Sản phẩm,Số lượng cần nhập
4353,Gối chống trào ngược trẻ em Mochee hình gấu 70...,1037
3209,Đồ chơi vận động tháp bóng lăn Sato,910
4954,Túi treo xe đẩy rút dây họa tiết gấu màu đen S17,506
7487,Set bao tay chân bo KidsPlaza TM21 (Hồng),1045
585,Ghế ăn dặm đa năng Nimo Little Friend 228 gấu con,847
...,...,...
7366,Yếm tròn Otis bé trai in gấu KidsPlaza ND22T,338
4725,Sữa bầu Morinaga vị trà xanh 216g,393
1153,Bô gấu Việt Nhật Hokori 5458,869
8003,Bộ cộc tay bé gái KidsPlaza cổ có nơ TT24H (Vàng),502


In [151]:
result_df.to_csv("predictions.csv", index=False) 