In [1]:
from utils import *
from prompt.response_prompt import *
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from transformers import BertModel, BertTokenizer


  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /data/chenxi/anaconda3/envs/myenv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# Load the diabetes dataset from its ARFF file into `df`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = arff_to_dataframe("/data/chenxi/llm-feature-engeneering/dataset/diabetes.arff")

In [3]:

# Shared seed so every stochastic estimator gives the same scores on each run
# (matches the seed used by the KFold splitter in evaluate_models).
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used when printing and plotting scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000000),
    'K-Nearest Neighbors': KNeighborsClassifier(algorithm='ball_tree'),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    # probability=True enables probability estimates (needed for ROC AUC on
    # probabilities); the internal calibration is randomised, hence the seed.
    'Support Vector Machine': SVC(probability=True, random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
}



In [5]:
# Reload the analyses cached by the generation cell. That cell writes the
# 'analysis' Series with index=False, so the file holds one 'analysis' column
# and no row labels.
# Select the column explicitly and assign by position: a row-count mismatch
# then raises immediately instead of being NaN-filled by index alignment.
df['analysis'] = pd.read_csv('analysis.csv')['analysis'].to_numpy()

In [6]:
# Sanity check: (rows, columns) of the frame, including the 'analysis' column.
print(df.shape)

(768, 10)


In [7]:
# Sanity check: the 'analysis' column should hold one entry per row.
print(df["analysis"].shape)

(768,)


In [4]:
# One prompt per row: the row's fields fill the placeholders of `template`.
prompts = df.apply(lambda record: template.format(**record), axis='columns')

# Pass every prompt to decoder_for_gpt3 (defined elsewhere; presumably a remote
# model call, so this step is slow) and store the replies beside their rows.
df['analysis'] = prompts.apply(
    lambda prompt_text: decoder_for_gpt3(prompt_text, max_length=1000)
)

# Persist only the replies so they can be reloaded without regenerating them.
df['analysis'].to_csv('analysis.csv', index=False)

Timeout error occurred: . Retrying in 60 seconds...


KeyboardInterrupt: 

In [8]:
# Wrap each stored analysis in the summarisation template.
prompts_sum = df['analysis'].apply(lambda row: template_for_sum.format(analysis=row))
# Send the summarisation prompts to the model. The earlier version passed
# `prompts` (the original analysis prompts) here, so 'sum' never contained a
# summary and the cell failed on a kernel where `prompts` was not defined.
df['sum'] = prompts_sum.apply(lambda x: decoder_for_gpt3(x, max_length=1000))
# Persist the summaries on their own so they can be reloaded later.
df['sum'].to_csv('sum.csv', index=False)

Timeout error occurred: . Retrying in 60 seconds...


In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` via ``openai.Embedding.create``.

    Parameters
    ----------
    text : str
        Text to embed; every newline is replaced by a space before the request.
    model : str
        Name of the embedding model passed through to the API call.

    Returns
    -------
    The ``'embedding'`` entry of the first item in the response's ``'data'`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

# Embed every row's text and keep the resulting vector in 'text_vector'.
# NOTE(review): no visible cell creates a 'response' column (the cells shown only
# add 'analysis' and 'sum') — confirm which column is meant to be embedded.
df['text_vector'] = df['response'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    
def explode(col, prefix):
    """Spread a Series of equal-length sequences into one column per position.

    Parameters
    ----------
    col : pandas.Series
        Each element is a sequence (e.g. an embedding vector). The number of
        output columns is taken from the first element.
    prefix : str
        Column-name prefix; position ``i`` becomes column ``prefix + str(i)``.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``col``, carrying ``col``'s index so the result
        aligns with the frame the Series came from. An empty ``col`` yields a
        frame with no columns.
    """
    if len(col) == 0:
        return pd.DataFrame(index=col.index)

    # Positional access: ``col[0]`` is a label lookup and fails when the index
    # does not contain the label 0 (e.g. after filtering or shuffling rows).
    n_cols = len(col.iloc[0])
    col_names = [prefix + str(i) for i in range(n_cols)]

    # Reuse the source index so assigning the result back aligns row for row.
    return pd.DataFrame(col.to_list(), columns=col_names, index=col.index)

# Prefix for the per-dimension columns, and the column holding the embedding vectors.
prefix = "vec_"
tab_vec_name = 'text_vector'

# Expand the vectors into one column per dimension and write them into df.
# Assigning by column name means a re-run overwrites the same columns
# instead of appending duplicates.
exploded = explode(df[tab_vec_name], prefix)
df.loc[:, exploded.columns] = exploded

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np

def _float_convertible_columns(frame):
    """Return one boolean per column of ``frame``: True when the column casts to float."""
    mask = []
    for position in range(frame.shape[1]):
        try:
            np.asarray(frame.iloc[:, position], dtype=float)
        except (TypeError, ValueError):
            mask.append(False)
        else:
            mask.append(True)
    return mask


def evaluate_models(df, models):
    """Cross-validate every classifier in ``models`` on ``df`` and box-plot the scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Outcome' column, used as the target. Every other column
        whose values can be cast to float is used as a feature; columns that
        cannot (free text, lists of embeddings, ...) are reported and ignored,
        because the scaler would otherwise fail on them.
    models : dict
        Maps a display name to an unfitted scikit-learn compatible classifier.

    Returns
    -------
    None
        Mean and standard deviation of accuracy and ROC AUC are printed per
        model, and one box plot per metric is shown.

    Raises
    ------
    ValueError
        If no float-convertible feature column remains.
    """
    features = df.drop('Outcome', axis=1)
    y = df['Outcome']

    # Keep only the columns the scaler can actually consume.
    keep = _float_convertible_columns(features)
    skipped = [name for name, ok in zip(features.columns, keep) if not ok]
    if skipped:
        print(f'Ignoring non-numeric columns: {skipped}')
    X = features.loc[:, keep]
    if X.shape[1] == 0:
        raise ValueError('evaluate_models needs at least one numeric feature column')

    # Candidate feature-selection methods (k='all' keeps every feature).
    feature_selection_methods = {
        'SelectKBest': SelectKBest(mutual_info_classif, k='all'),
    }

    # NOTE(review): the scaler is fitted on all rows before cross-validation, so
    # held-out folds contribute to the scaling statistics.
    X_scaled = StandardScaler().fit_transform(X)

    # Apply each feature-selection method to the scaled features.
    selected_features = {
        name: method.fit_transform(X_scaled, y)
        for name, method in feature_selection_methods.items()
    }

    # Score each selection with a baseline logistic regression and keep the best one.
    baseline = LogisticRegression()
    scores = {
        name: cross_val_score(baseline, selected, y, cv=5, scoring='accuracy').mean()
        for name, selected in selected_features.items()
    }
    best_method = max(scores, key=scores.get)
    X_selected = selected_features[best_method]

    # Prepare cross-validation (shuffled, fixed seed so the folds are reproducible).
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Per-metric mapping of model name -> array of per-fold scores.
    performance_metrics = {'accuracy': {}, 'roc_auc': {}}

    # Cross-validate every model and report its scores.
    for name, model in models.items():
        accuracy_scores = cross_val_score(model, X_selected, y, cv=kfold, scoring='accuracy')
        roc_auc_scores = cross_val_score(model, X_selected, y, cv=kfold, scoring='roc_auc')

        performance_metrics['accuracy'][name] = accuracy_scores
        performance_metrics['roc_auc'][name] = roc_auc_scores

        print(f'{name}:')
        print(f'Accuracy: {accuracy_scores.mean()} ± {accuracy_scores.std()}')
        print(f'ROC AUC: {roc_auc_scores.mean()} ± {roc_auc_scores.std()}')
        print()

    box_color = 'black'
    model_names = list(models)
    x_ticks_positions = np.arange(len(model_names))

    # One figure per metric, one box per model (distribution over the folds).
    for metric in ['accuracy', 'roc_auc']:
        plt.figure(figsize=(15, 10))

        data_to_plot = [performance_metrics[metric][model_name] for model_name in model_names]
        plt.boxplot(data_to_plot, positions=x_ticks_positions, widths=0.6, patch_artist=True,
                    boxprops=dict(facecolor=box_color, color=box_color, alpha=0.6),
                    capprops=dict(color=box_color),
                    whiskerprops=dict(color=box_color),
                    flierprops=dict(color=box_color, markeredgecolor=box_color),
                    medianprops=dict(color='black'))

        plt.title(f"Model performance ({metric})")
        plt.ylabel(metric)
        plt.xticks(ticks=x_ticks_positions, labels=model_names)
        plt.show()

# Run the comparison on the current frame with the classifiers defined in `models`.
evaluate_models(df, models)


<bound method NDFrame.head of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50 