In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Draw all matplotlib text with the NanumGothic family (presumably so the
# Hangul column names used later render correctly -- TODO confirm).
plt.rcParams['font.family'] = 'NanumGothic'

In [None]:
# Load the user table from a CSV file located in the current working directory.
df=pd.read_csv('users.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Keep only the three columns this analysis works with.
feature_columns = ['age', 'gender', 'skin_type']
data = df[feature_columns]

In [None]:
# Smoothed age distribution with a rug of the individual observations.
# sns.distplot is deprecated and scheduled for removal; kdeplot + rugplot draw
# the same two layers the original call asked for (hist=False, kde=True,
# rug=True). The axes is kept explicitly so both layers share one figure.
fig, ax = plt.subplots(figsize=(20, 5))
sns.kdeplot(data['age'], label='1', ax=ax)
sns.rugplot(data['age'], ax=ax)
ax.set_title('Age distribution');

In [None]:
# Number of rows per gender value; hue repeats the x variable, so each
# category is drawn in its own colour.
sns.countplot(data=data, x='gender', hue='gender')

In [None]:
# Number of rows per skin type; hue repeats the x variable, so each
# category is drawn in its own colour.
sns.countplot(data=data, x='skin_type', hue='skin_type')

In [None]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Horizontal box plot of the age column, drawn before the age-range filter
# that follows. The trace name, title and axis labels used to describe image
# widths/heights (left over from a different analysis); they now describe the
# data that is actually plotted.
traceW = go.Box(x=data['age'].values,
                name="Age",
                marker=dict(color='rgba(238,23,11,0.5)',
                            line=dict(color='red', width=1.2)),
                orientation='h')

box = [traceW]

layout = dict(title='Distribution of user age',
              xaxis=dict(title='Age', showticklabels=True),
              yaxis=dict(title=''),
              hovermode='closest')

fig = dict(data=box, layout=layout)

# The output file name is intentionally left unchanged in case anything else
# refers to it.
plot(fig, filename='width-height.html')

In [None]:
# Keep only rows whose age is strictly greater than 10 and strictly less than 36.
older_than_10 = data['age'] > 10
younger_than_36 = data['age'] < 36
data = data.loc[younger_than_36 & older_than_10]

In [None]:
# Age distribution again, now on the range-filtered rows.
# sns.distplot is deprecated and scheduled for removal; kdeplot + rugplot draw
# the same two layers the original call asked for (hist=False, kde=True,
# rug=True). The axes is kept explicitly so both layers share one figure.
fig, ax = plt.subplots(figsize=(20, 5))
sns.kdeplot(data['age'], label='1', ax=ax)
sns.rugplot(data['age'], ax=ax)
ax.set_title('Age distribution (10 < age < 36)');

In [None]:
from scipy.stats import skew
# Skewness of the age column as it stands after the range filter.
skew(data['age'])

In [None]:
# One-hot encode the two categorical columns and attach the indicator columns
# to the frame by index (skin type first, then gender, so the column order is
# age, gender, skin_type, skin_type_*, gender_*).
skin_type_dummies = pd.get_dummies(data['skin_type'], prefix='skin_type')
data = data.merge(skin_type_dummies, left_index=True, right_index=True)

gender_dummies = pd.get_dummies(data['gender'], prefix='gender')
data = data.merge(gender_dummies, left_index=True, right_index=True)

In [None]:
# The raw categorical columns are redundant once their indicator columns exist.
data = data.drop(columns=['skin_type', 'gender'])

In [None]:
# Show the column labels left after encoding and dropping the source columns.
data.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column, then wrap the returned array in a DataFrame again.
# The new frame gets a fresh 0..n-1 row index and integer column labels.
scaled_values = StandardScaler().fit_transform(data)
df2 = pd.DataFrame(scaled_values)

In [None]:
# Restore the feature names that were lost when the scaler returned a bare
# array. They are taken from `data` (the frame that was scaled) rather than
# from a hand-copied list, so the labels always match the indicator columns
# that are actually present, in the right order and number.
df2.columns = list(data.columns)

In [None]:
# Preview the first rows of the scaled, relabelled feature table.
df2.head()

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score

# Candidate cluster counts: 2 through 10.
range_n_clusters = list(range(2, 11))

for n_clusters in range_n_clusters:
    # Seeded (10) so each k is fitted the same way on every run.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(df2)

    # Mean silhouette over all samples for this k.
    silhouette_avg = silhouette_score(df2, cluster_labels)
    print(
        "For n_clusters =", n_clusters,
        "The average silhouette_score is :", silhouette_avg,
    )


In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt



# Elbow method: for each k, record the mean distance from every sample to its
# nearest cluster centre.
distortions = []
K = range(1, 15)
for k in K:
    # Fit exactly once per k (the model used to be fitted twice), with the
    # same seed as the silhouette scan so the curve is reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=10).fit(df2)
    nearest_centre_dist = cdist(df2, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_dist.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
n_clusters = [2,3,4,5,6,7,8,10,20,79]

def compare_k_means(k_list, df2, random_state=10):
    """Fit KMeans once per k and print the silhouette coefficient of each fit.

    Parameters
    ----------
    k_list : iterable of int
        Cluster counts to try.
    df2 : array-like
        Feature matrix handed to ``KMeans.fit`` and ``silhouette_score``.
    random_state : int or None, default 10
        Seed forwarded to KMeans so repeated calls print the same scores.

    Returns
    -------
    None
        Results are printed, one line per k.

    Notes
    -----
    This cell only defines the function; run
    ``compare_k_means(n_clusters, df2)`` to produce the report.
    """
    for k in k_list:
        # n_jobs is no longer passed: current scikit-learn KMeans does not accept it.
        clusterer = KMeans(n_clusters=k, random_state=random_state)
        clusterer.fit(df2)
        # The higher (up to 1) the better. The print belongs inside the loop:
        # it was previously dedented to module level, where it ran once at
        # definition time against whatever `k`/`clusterer` happened to exist.
        score = metrics.silhouette_score(df2, clusterer.labels_)
        print("Silhouette Coefficient for k == %s: %s" % (k, round(score, 4)))


In [None]:
# Final model: partition the scaled rows into 6 clusters. The seed keeps the
# cluster labels -- which are used as classification targets further down --
# stable between runs.
kmeans = KMeans(n_clusters=6, random_state=10)
# NOTE: fit() returns the fitted estimator itself, so `predict` is the same
# object as `kmeans`, not an array of predictions.
predict = kmeans.fit(df2)

In [None]:
# Attach each row's cluster label to the scaled features as a 'predict' column.
out = pd.DataFrame(kmeans.labels_, columns=['predict'])
# axis is passed by keyword: pandas 2.x rejects it as a positional argument.
data2 = pd.concat([df2, out], axis=1)

In [None]:
# Display the scaled features together with their cluster label.
data2

In [None]:
from mpl_toolkits.mplot3d import Axes3D
# 3-D scatter of three of the scaled features, coloured by cluster label.
fig = plt.figure(figsize=(6, 6))
# Create the axes through the figure so it is actually attached to it:
# constructing Axes3D(fig, ...) directly is no longer auto-added to the figure
# in current matplotlib, and the old rect=[0, 1, 1, 1] placed the axes above
# the visible canvas.
ax = fig.add_subplot(111, projection='3d', elev=48, azim=134)
ax.scatter(data2['age'], data2['skin_type_민감성'], data2['gender_2'],
           c=data2['predict'], alpha=0.5)
ax.set_xlabel('age')
ax.set_ylabel('skin_type_민감성')
ax.set_zlabel('gender_2')
plt.show()

In [None]:
# Count missing values per column of the labelled table.
data2.isna().sum()

In [None]:
# Features: every column except the cluster label.
X = data2.drop(columns='predict')
# Target: the cluster label as a 1-D Series. scikit-learn estimators expect
# 1-D labels; the previous one-column DataFrame only worked via an implicit
# conversion whose warning was hidden by the global warning filter.
y = data2['predict']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split identical between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=10)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [None]:
# 10-fold stratified splitter, passed as `cv` to the model comparison.
kfold = StratifiedKFold(n_splits=10)

In [None]:
# Cross-validate ten classifiers on the training split and chart their mean
# accuracy with the fold-to-fold standard deviation as error bars.
random_state = 2

# Display name and estimator are declared together so the chart labels cannot
# drift out of step with the models (they used to live in two parallel lists).
named_classifiers = [
    ("SVC", SVC(random_state=random_state)),
    ("DecisionTree", DecisionTreeClassifier(random_state=random_state)),
    ("AdaBoost", AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                    random_state=random_state, learning_rate=0.1)),
    ("RandomForest", RandomForestClassifier(random_state=random_state)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=random_state)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=random_state)),
    ("MultipleLayerPerceptron", MLPClassifier(random_state=random_state)),
    ("KNeighboors", KNeighborsClassifier()),
    ("LogisticRegression", LogisticRegression(random_state=random_state)),
    ("LinearDiscriminantAnalysis", LinearDiscriminantAnalysis()),
]
classifiers = [estimator for _, estimator in named_classifiers]

# One array of per-fold accuracies per classifier.
cv_results = [
    cross_val_score(classifier, X_train, y=y_train, scoring="accuracy", cv=kfold, n_jobs=4)
    for classifier in classifiers
]
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

cv_res = pd.DataFrame({
    "CrossValMeans": cv_means,
    "CrossValerrors": cv_std,
    "Algorithm": [label for label, _ in named_classifiers],
})

# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional call raises TypeError there.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res,
                palette="Set3", orient="h", xerr=cv_std)
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")


In [None]:
# Inspect the dimensions of the target container.
y.shape

In [None]:
# Fit the gradient-boosting model on the training split. It is seeded with
# the notebook's `random_state` (like the estimators in the comparison) so the
# fitted model and its scores are repeatable, and the labels are flattened to
# 1-D, which is the shape scikit-learn expects for y.
gbc = GradientBoostingClassifier(random_state=random_state)
gbc = gbc.fit(X_train, np.ravel(y_train))

In [None]:
# 10-fold cross-validation of the gradient-boosting model on the training
# split; print the per-fold scores followed by their mean and spread.
gbc_scores = cross_val_score(gbc, X_train, y_train, cv=10)
print('Scores =', gbc_scores)
cv_mean, cv_spread = np.mean(gbc_scores), np.std(gbc_scores)
print('CV accuracy: %.3f +/- %.3f' % (cv_mean, cv_spread))

In [None]:
from sklearn.metrics import classification_report

# Predict the held-out rows with the fitted model and print the per-class
# classification report for them.
y_pred = gbc.predict(X_test)
print('\nClassification metrics')
report_text = classification_report(y_test, y_pred)
print(report_text)