# In [4]:
import joblib
import nltk
import numpy as np
import optuna
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Sample data: every sentence is stored together with its sentiment label
# (1 = positive, 0 = negative). Keeping text and label in one tuple means the
# two columns can never end up with different lengths, which is what makes
# pd.DataFrame raise "All arrays must be of the same length" when parallel
# lists are edited by hand.
labeled_samples = [
    # Positive examples
    ('I love machine learning', 1),
    ('Deep learning is amazing', 1),
    ('Natural language processing is fun', 1),
    ('I enjoy coding in Python', 1),
    ('Data science is a great field', 1),
    # Negative examples
    ('This product is of very poor quality. It broke after just one use. I would not recommend it to anyone.', 0),
    ('The movie was a total waste of time. The plot was boring and the acting was terrible.', 0),
    ('The food was awful and the service was even worse. I will never eat here again.', 0),
    ('I had a terrible experience at the hotel. The room was dirty, and the amenities were not as described.', 0),
    ('I feel undervalued and overworked in this job. The management does not care about employee well-being.', 0),
    # Additional mixed sentiment examples
    ('I am feeling very unhappy and stressed out with how things are going in my life right now.', 0),
    ('The event was well-organized and everyone had a great time.', 1),
    ('I absolutely love this new app! It makes my life so much easier.', 1),
    ('The customer service was excellent. They were very helpful and resolved my issue quickly.', 1),
    ("The book was inspiring and beautifully written. I couldn't put it down.", 1),
    ('The concert exceeded my expectations. The performers were incredible.', 1),
    ('The new restaurant in town is fantastic. The food is delicious and the ambiance is perfect.', 1),
    ('I hate u', 0),
]

# Keep the column-oriented dict under its original name so any code that
# reads `data` directly continues to work.
data = {
    'text': [text for text, _ in labeled_samples],
    'label': [label for _, label in labeled_samples],
}

df = pd.DataFrame(data)

# Load the NLTK English stop-word list, downloading the corpus only when it
# is not installed yet. An unconditional nltk.download() contacts the download
# server on every run; a missing corpus surfaces as LookupError, so that is
# the cue to fetch it.
# NOTE(review): `stop_words` is not read by the visible code (preprocess()
# relies on spaCy's token.is_stop) - kept in case other cells use it.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Initialize the spaCy pipeline that preprocess() uses for tokens and lemmas
nlp = spacy.load('en_core_web_sm')

# Text normalization helper
def preprocess(text):
    """Return *text* reduced to its space-separated lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or as punctuation are skipped; the lemma of every
    remaining token is collected and the lemmas are joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

# Lemmatize every raw sentence and store the result next to the original text
df['cleaned_text'] = df['text'].apply(preprocess)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# Stand-alone TF-IDF features: learn the vocabulary and IDF weights from the
# training split only, then project both splits with that fitted vectorizer.
# (The pipelines visible further down construct their own TfidfVectorizer.)
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of one candidate model.

    The trial chooses a classifier family and its regularization (``C``) or
    smoothing (``alpha``) strength. The candidate is wrapped in a TF-IDF
    pipeline and scored with stratified 3-fold cross-validation on the
    training split only.

    The held-out ``X_test``/``y_test`` rows are deliberately not used here:
    scoring trials on them would tune the search to the very rows later used
    to report the final accuracy, making that figure optimistic.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds (higher is better).

    Raises:
        ValueError: Propagated from scikit-learn when a class has fewer
            training examples than the number of folds.
    """
    # Define model type
    model_type = trial.suggest_categorical(
        'model_type', ['logistic_regression', 'svm', 'naive_bayes']
    )

    if model_type == 'logistic_regression':
        C = trial.suggest_float('C', 1e-5, 1e2, log=True)
        model = LogisticRegression(C=C, max_iter=1000)
    elif model_type == 'svm':
        C = trial.suggest_float('C', 1e-5, 1e2, log=True)
        model = SVC(C=C)
    else:
        alpha = trial.suggest_float('alpha', 1e-5, 1e0, log=True)
        model = MultinomialNB(alpha=alpha)

    # The vectorizer lives inside the pipeline so that its vocabulary and IDF
    # weights are re-learned from the training part of every fold.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('model', model)
    ])

    # Stratified folds keep both classes present in every fold; the fixed
    # seed makes all trials see the same folds, so their scores are comparable.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(
        pipeline, X_train, y_train, cv=folds, scoring='accuracy'
    )

    return float(scores.mean())

# Run the hyperparameter search: 50 trials, larger objective values win
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the winning configuration
best_params = study.best_params
print("Best parameters: ", best_params)

# Rebuild the winning estimator from the stored parameters. Any model type
# other than the two named ones falls through to naive Bayes, mirroring the
# branches used during the search.
model_type = best_params['model_type']

if model_type == 'svm':
    best_model = SVC(C=best_params['C'])
elif model_type == 'logistic_regression':
    best_model = LogisticRegression(max_iter=1000, C=best_params['C'])
else:
    best_model = MultinomialNB(alpha=best_params['alpha'])

# Same two-stage layout as in the search: TF-IDF features, then the classifier
final_pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('model', best_model)])

# Fit on the training split
final_pipeline.fit(X_train, y_train)

# Score the held-out split
y_pred = final_pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Final model accuracy: ", accuracy)

# Persist the fitted pipeline to disk
joblib.dump(final_pipeline, 'nlp_automl_model.pkl')

# Read it back, the way an inference process would
# (joblib files are pickle-based: only load files from a trusted source)
loaded_model = joblib.load('nlp_automl_model.pkl')

# Classify one unseen sentence; it goes through the same preprocess() step
# that was applied to the training text
example_text = ["I love studying data science"]
example_cleaned = list(map(preprocess, example_text))
prediction = loaded_model.predict(example_cleaned)
print("Prediction: ", prediction)


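# Recorded cell output: ValueError: All arrays must be of the same length
# (pandas raises this from pd.DataFrame(data) when the 'text' and 'label' lists differ in length.)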