In [None]:
!pip install scikit-learn

In [1]:
import pandas as pd
import numpy as np
import sklearn
import sys
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Read data into three dataframes with pandas
sdpp = pd.read_csv('Shear-Driven_Polymer_Precipitation.csv', usecols=['Polymer','MW','Concentration','Solvent','Nonsolvent','Morphology'])
#sdpp = sdpp.dropna()
solvent_df = pd.read_csv('Solvent_Information.csv', usecols=['Solvent','Solvent HSPd','Solvent HSPp','Solvent HSPh','Solvent Viscosity','Solvent Density','Solvent MW'])
#solvent_df = solvent_df.dropna()
polymer_df = pd.read_csv('Polymer_Information.csv', usecols=['Polymer','Polymer HSPd','Polymer HSPp','Polymer HSPh','Polymer Density','R0'])
#polymer_df = polymer_df.dropna()

In [4]:
#Combine Three Datasets into Single DataFrame
combo = pd.merge(left=sdpp, right=solvent_df, how='inner', on='Solvent')
combined = pd.merge(left=combo, right=polymer_df, how='inner', on='Polymer')

combined['Nonsolvent Viscosity'] = pd.Series(dtype=float)
for i in range(len(combined)):
    nonsolvent = combined['Nonsolvent'][i]
    combined.loc[i, 'Nonsolvent Viscosity'] = solvent_df['Solvent Viscosity'][solvent_df.Solvent == nonsolvent].values[0]

In [7]:
# Convert wt.% of polymer to volume and mole fraction
combined['Polymer VolFrac'] = pd.Series(dtype=float)
#combined['Polymer MolFrac'] = pd.Series()
#combined['Solvent MolFrac'] = pd.Series()
combined['Solvent VolFrac'] = pd.Series(dtype=float)
#Calculate HSPs of polymer solution taking into account polymer HSPs and concentration
combined['PolySol HSPd'] = pd.Series(dtype=float)
combined['PolySol HSPp'] = pd.Series(dtype=float)
combined['PolySol HSPh'] = pd.Series(dtype=float)

for i in range(len(combined)):
    PolymerWt = combined['Concentration'][i]
    SolventWt = 1 - PolymerWt
    PolymerMolFrac = (PolymerWt / combined['MW'][i]) / ((PolymerWt / combined['MW'][i]) + (SolventWt / combined['Solvent MW'][i]))
    SolventMolFrac = 1- PolymerMolFrac
    PolymerVolFrac = (PolymerWt / combined['Polymer Density'][i]) / ((PolymerWt / combined['Polymer Density'][i]) + (SolventWt / combined['Solvent Density'][i]))
    SolventVolFrac = 1- PolymerVolFrac
    combined.loc[i, 'Polymer VolFrac'] = PolymerVolFrac
    combined.loc[i, 'Solvent VolFrac'] = SolventVolFrac
    combined.loc[i, 'PolySol HSPd'] = (PolymerVolFrac *  combined['Polymer HSPd'][i]) + (SolventVolFrac * combined['Solvent HSPd'][i])
    combined.loc[i, 'PolySol HSPp'] = (PolymerVolFrac *  combined['Polymer HSPp'][i]) + (SolventVolFrac * combined['Solvent HSPp'][i])
    combined.loc[i, 'PolySol HSPh'] = (PolymerVolFrac *  combined['Polymer HSPh'][i])  + (SolventVolFrac * combined['Solvent HSPh'][i])

#Find difference in viscosity between solvent and nonsolvet solutions
combined['Injection Viscosity'] = pd.Series(dtype=float)
#Multiply solvent viscosity times polymer volume fraction times MW^1/2
combined['Injection Viscosity'] = combined['Solvent Viscosity'] * combined['Polymer VolFrac'] * combined['MW']**(1/2)
combined['Viscosity Difference'] = combined['Nonsolvent Viscosity'] - combined['Injection Viscosity']
combined['Viscosity Ratio'] = combined['Nonsolvent Viscosity'] / combined['Injection Viscosity']

##Viscosity Ratio????

##Calculate HSP distance and Polymer Solution Viscosity
combined['Ra'] = ((4*(combined['PolySol HSPd']-combined['Solvent HSPd'])**2) + (combined['PolySol HSPp']-combined['Solvent HSPp'])**2
                  + combined['PolySol HSPh']-combined['Solvent HSPh']**2)**1/2
combined['RED'] = combined['Ra'] / combined['R0']

cols_at_end = ['Morphology']
combined = combined[[c for c in combined if c not in cols_at_end]
        + [c for c in cols_at_end if c in combined]]

#combined.to_csv('SDPP_combined.csv')
combined.dropna()
#combined.to_csv()

Unnamed: 0,Polymer,MW,Concentration,Solvent,Nonsolvent,Solvent HSPd,Solvent HSPp,Solvent HSPh,Solvent Viscosity,Solvent Density,...,Solvent VolFrac,PolySol HSPd,PolySol HSPp,PolySol HSPh,Injection Viscosity,Viscosity Difference,Viscosity Ratio,Ra,RED,Morphology
0,PS,230.0,0.010,THF,Water,16.8,5.7,8.0,0.46,0.886,...,0.991130,16.816853,5.701774,7.960084,0.061880,0.828120,14.382658,-28.019388,-2.206251,1
1,PS,230.0,0.015,THF,Water,16.8,5.7,8.0,0.46,0.886,...,0.986687,16.825294,5.702663,7.940093,0.092873,0.797127,9.582967,-28.028671,-2.206982,2
2,PS,230.0,0.030,THF,Water,16.8,5.7,8.0,0.46,0.886,...,0.973329,16.850675,5.705334,7.879980,0.186065,0.703935,4.783276,-28.054860,-2.209044,3
3,PS,230.0,0.100,THF,Water,16.8,5.7,8.0,0.46,0.886,...,0.910378,16.970281,5.717924,7.596702,0.625223,0.264777,1.423492,-28.143497,-2.216023,4
4,PS,230.0,0.200,THF,Water,16.8,5.7,8.0,0.46,0.886,...,0.818666,17.144535,5.736267,7.183995,1.265033,-0.375033,0.703539,-28.169936,-2.218105,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,CS,120.0,0.005,Water,EtOH,15.6,16.0,42.3,0.89,1.000,...,0.995000,15.631500,16.082500,42.211500,0.048747,1.031253,22.155070,-873.533862,-109.191733,1
67,CS,120.0,0.010,Water,EtOH,15.6,16.0,42.3,0.89,1.000,...,0.990000,15.663000,16.165000,42.123000,0.097495,0.982505,11.077535,-873.561949,-109.195244,2
68,CS,120.0,0.020,Water,EtOH,15.6,16.0,42.3,0.89,1.000,...,0.980000,15.726000,16.330000,41.946000,0.194989,0.885011,5.538767,-873.585798,-109.198225,3
69,CS,120.0,0.030,Water,EtOH,15.6,16.0,42.3,0.89,1.000,...,0.970000,15.789000,16.495000,41.769000,0.292484,0.787516,3.692512,-873.566545,-109.195818,4


In [8]:
#Maybe Dont Use Chitosan? -- HSP values seem suspect
combined = combined[combined['Polymer'] != 'CS']

In [9]:
combined.columns

Index(['Polymer', 'MW', 'Concentration', 'Solvent', 'Nonsolvent',
       'Solvent HSPd', 'Solvent HSPp', 'Solvent HSPh', 'Solvent Viscosity',
       'Solvent Density', 'Solvent MW', 'Polymer HSPd', 'Polymer HSPp',
       'Polymer HSPh', 'Polymer Density', 'R0', 'Nonsolvent Viscosity',
       'Polymer VolFrac', 'Solvent VolFrac', 'PolySol HSPd', 'PolySol HSPp',
       'PolySol HSPh', 'Injection Viscosity', 'Viscosity Difference',
       'Viscosity Ratio', 'Ra', 'RED', 'Morphology'],
      dtype='object')

In [10]:
y = combined[['Morphology']] # Deleting .drop below 27

In [22]:
# Testing dropping different columns
X = combined.drop(['Morphology', 'MW', 'Polymer VolFrac', 'Injection Viscosity', 'Polymer', 'Concentration', 'Solvent', 'Nonsolvent','Solvent HSPd', 'Solvent HSPp', 'Solvent HSPh',
                   'Polymer HSPd', 'Polymer HSPp', 'Polymer HSPh', 'Solvent VolFrac','Solvent Viscosity', 'Nonsolvent Viscosity',
                   'Polymer Density','Solvent Density', 'Ra', 'R0', 'Solvent MW', 'PolySol HSPd', 'PolySol HSPp', 'PolySol HSPh', 'Viscosity Difference'], axis=1)

In [23]:
X.head(20)

Unnamed: 0,Viscosity Ratio,RED
0,14.382658,-2.206251
1,9.582967,-2.206982
2,4.783276,-2.209044
3,1.423492,-2.216023
4,0.703539,-2.218105
5,7.766419,-4.584888
6,1.549803,-4.589392
7,0.772725,-4.581561
8,24.834299,-1.654479
9,4.924252,-1.655214


In [21]:
from sklearn import neighbors, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33)

In [25]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [26]:
X_train = scaler.transform(X_train)

In [27]:
X_test = scaler.transform(X_test)

In [None]:
# SGD Classifier


In [28]:
#K Nearest Neighbors Algorithm

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred= knn.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(knn.score(X_test, y_test))

#Random guess would produce correct morpolhoy ~20% of the time (1-5)
#50% accuracy with minimal data isn't horrible

0.3181818181818182
0.3181818181818182


  return self._fit(X, y)


In [29]:
#Linear Regression Algorithm

lr = LinearRegression(normalize=True)
lr.fit(X,y)
y_pred = lr.predict(X_test)
#accuracy_score(y, y_pred)

TypeError: LinearRegression.__init__() got an unexpected keyword argument 'normalize'

In [30]:
#Support Vector Machine Algorithm

svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

  y = column_or_1d(y, warn=True)


0.5

In [31]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [35]:
# Testing different models
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

#Regression models
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR


In [37]:
#average classifier scores over 5 training cycles
counter = 1
tally = []
while counter<6:
    h = .02  # step size in the mesh
    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
             "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
             "Naive Bayes", "QDA", "Logistic Regression", "SDGC Classifier",
             "Gradient Boosting Classifier", "LGBM Classifier", "XGB Classifier",
             "LGBM Regressor", "XGB Regressor", "Cat Boost Regressor",
             "SGD Regressor", "Kernel Ridge", "ElasticNet", "Bayesian Ridge", "Gradient Boosting Regressor",
             "SVR"]

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1, max_iter=1000),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
        SGDClassifier(),
        GradientBoostingClassifier(),
        LGBMClassifier(),
        XGBClassifier(),
        LGBMRegressor(),
        XGBRegressor(),
        CatBoostRegressor(),
        SGDRegressor(),
        KernelRidge(),
        ElasticNet(),
        BayesianRidge(),
        GradientBoostingRegressor(),
        SVR()]


    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.35, random_state=42)

    # Adjust y_train and y_test to be 0-indexed
    y_train = y_train - 1
    y_test = y_test - 1


    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
    #     ax = plt.subplot(1, len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        tally.append([name, score])
        print(name, score*100)

    print('\n')
    counter +=1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
932:	learn: 0.0776704	total: 222ms	remaining: 16ms
933:	learn: 0.0774127	total: 222ms	remaining: 15.7ms
934:	learn: 0.0773598	total: 222ms	remaining: 15.5ms
935:	learn: 0.0769679	total: 223ms	remaining: 15.2ms
936:	learn: 0.0768836	total: 223ms	remaining: 15ms
937:	learn: 0.0768318	total: 223ms	remaining: 14.7ms
938:	learn: 0.0765810	total: 223ms	remaining: 14.5ms
939:	learn: 0.0763890	total: 223ms	remaining: 14.3ms
940:	learn: 0.0759865	total: 224ms	remaining: 14ms
941:	learn: 0.0759283	total: 224ms	remaining: 13.8ms
942:	learn: 0.0756817	total: 224ms	remaining: 13.5ms
943:	learn: 0.0754931	total: 224ms	remaining: 13.3ms
944:	learn: 0.0753068	total: 224ms	remaining: 13.1ms
945:	learn: 0.0749887	total: 224ms	remaining: 12.8ms
946:	learn: 0.0749398	total: 225ms	remaining: 12.6ms
947:	learn: 0.0745558	total: 225ms	remaining: 12.3ms
948:	learn: 0.0743170	total: 225ms	remaining: 12.1ms
949:	learn: 0.0741355	total: 225ms	remai

In [None]:
counter = 1
tally = []
while counter<6:
  h = .02  # step size in the mesh

  X = StandardScaler().fit_transform(X)
  X_train, X_test, y_train, y_test = \
      train_test_split(X, y, test_size=.35, random_state=42)

  x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
  y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
  xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                        np.arange(y_min, y_max, h))


  model = classifiers[17]
  model.fit(X_train, y_train)
  score = clf.score(X_test, y_test)
  tally.append([name, score])
  print(model, score*100)

  print('\n')
  counter +=1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
19:	learn: 1.2621260	total: 21.7ms	remaining: 1.06s
20:	learn: 1.2527588	total: 22.7ms	remaining: 1.06s
21:	learn: 1.2367039	total: 23.7ms	remaining: 1.05s
22:	learn: 1.2244142	total: 24.5ms	remaining: 1.04s
23:	learn: 1.2097824	total: 25.6ms	remaining: 1.04s
24:	learn: 1.2005022	total: 26.7ms	remaining: 1.04s
25:	learn: 1.1847162	total: 27.2ms	remaining: 1.02s
26:	learn: 1.1727770	total: 28.3ms	remaining: 1.02s
27:	learn: 1.1603844	total: 29.3ms	remaining: 1.02s
28:	learn: 1.1522415	total: 30.6ms	remaining: 1.02s
29:	learn: 1.1417288	total: 31.9ms	remaining: 1.03s
30:	learn: 1.1291380	total: 33ms	remaining: 1.03s
31:	learn: 1.1175689	total: 34.1ms	remaining: 1.03s
32:	learn: 1.1052817	total: 35ms	remaining: 1.02s
33:	learn: 1.0934252	total: 38.2ms	remaining: 1.09s
34:	learn: 1.0838983	total: 39.3ms	remaining: 1.08s
35:	learn: 1.0764652	total: 40.6ms	remaining: 1.09s
36:	learn: 1.0675536	total: 41.7ms	remaining: 1.09s
37:

In [None]:
combined.to_csv('SDPP_combined.csv')

In [None]:
np.set_printoptions(threshold=np.inf)
print(X)

[[ 0.53629117 -0.80169316 -0.36245076 -1.25859863 -0.52625509 -1.00780124
  -1.04491282  0.20998813  0.32002328 -1.20725488 -1.18635872 -0.93652664
   0.81611103 -0.63581076 -0.77798693  0.77798693 -0.44018824 -1.27096062
  -0.52532923 -0.69173077  0.64869837  0.64869837  0.45987292  0.50962286
  -1.3771843  -0.37139068  1.28062485 -0.21821789 -0.31622777 -0.25400025
  -0.42257713 -0.21821789 -0.34444748 -0.70710678 -0.21821789 -0.21821789
   1.84390889 -0.42257713  0.63581076]
 [ 0.53629117 -0.7447988  -0.36245076 -1.25859863 -0.52625509 -1.00780124
  -1.04491282  0.20998813  0.32002328 -1.20725488 -1.18635872 -0.93652664
   0.81611103 -0.63581076 -0.71694762  0.71694762 -0.43208903 -1.27077794
  -0.52722264 -0.65589225  0.61170706  0.61170706  0.45984198  0.50959709
  -0.70884486 -0.37139068  1.28062485 -0.21821789 -0.31622777 -0.25400025
  -0.42257713 -0.21821789 -0.34444748 -0.70710678 -0.21821789 -0.21821789
   1.84390889 -0.42257713  0.63581076]
 [ 0.53629117 -0.57411575 -0.36245

In [None]:
print(X.head)

AttributeError: ignored

In [None]:
dataset = pd.read_csv('SDPP_combined.csv')
X = dataset.iloc[:, np.r_[2, 6:11, 12:16, 19,20, 21,22, 25,27]].values # don't use Ra, use viscosity ratio, Keep polysol, No R0, No polymer VolFrac,Keep solvent MW, delete categoricaldata
y = dataset.iloc[:, -1].values

In [None]:
print(combined.head)

<bound method NDFrame.head of    Polymer     MW  Concentration  ...          Ra        RED  Morphology
0       PS  230.0          0.010  ...  -28.019388  -2.206251           1
1       PS  230.0          0.015  ...  -28.028671  -2.206982           2
2       PS  230.0          0.030  ...  -28.054860  -2.209044           3
3       PS  230.0          0.100  ...  -28.143497  -2.216023           4
4       PS  230.0          0.200  ...  -28.169936  -2.218105           5
..     ...    ...            ...  ...         ...        ...         ...
61    PVOH   15.0          0.050  ... -874.155092 -83.252866           1
62    PVOH   15.0          0.100  ... -874.776843 -83.312080           2
63    PVOH   15.0          0.200  ... -875.896590 -83.418723           2
64    PVOH   15.0          0.300  ... -876.835634 -83.508156           3
65    PVOH   15.0          0.400  ... -877.573369 -83.578416           3

[66 rows x 28 columns]>


In [None]:
#X = pd.get_dummies(combined, prefix_sep='_', drop_first=True)
print(X.columns.tolist())

AttributeError: ignored

In [None]:
X.drop[['Morphology']]

TypeError: ignored

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.23, random_state = 0)

In [None]:
print(X.shape)

(66, 15)


In [18]:
!pip install scikit-learn

