<h2>Análise de Gênero de Autores:</h2>
<ul>
    <li>Fazer um experimento de classificação de texto e utilizar as métricas de classificação.</li>
    <li>Quem escreve mais páginas, homens ou mulheres, ou não há diferença? (Fazer um gráfico)</li>
    <li>Como é a distribuição de gênero de livro para homens e mulheres? (Fazer um gráfico)</li>
    <li>Com todos os atributos, fazer um experimento para identificar o gênero do autor. Não utilize a coluna gênero do autor, nem nome do autor, nem id do autor como atributo.</li>
    <li>Fazer alguma visualização com wordcloud.</li>
</ul>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
csv_path = 'good_reads_final.csv'
df = pd.read_csv(csv_path)

In [None]:
# Display the column index of the loaded frame (cell output only, no state change).
df.columns

<h2>Limpando os Dados</h2>

In [None]:
# Remove the URL columns. Rebinding `df` (instead of `inplace=True`) together
# with errors='ignore' keeps this cell idempotent: re-running it after the
# columns are already gone no longer raises a KeyError.
df = df.drop(columns=['book_fullurl', 'author_page_url'], errors='ignore')

In [None]:
# Replace the literal label '1 page' with the number 1; other values are kept as-is.
df['pages'] = np.where(df['pages'] == '1 page', 1, df['pages'])

# Strip line breaks from the free-text columns. The `regex` default of
# Series.str.replace changed between pandas versions (regex before 2.0,
# literal from 2.0 on), so '\\n' used to match a real newline and now matches
# a literal backslash followed by 'n'. Passing an explicit regex that covers
# both forms gives the same result on every pandas version.
newline_pattern = r'\\n|\n'
for text_column in ('birthplace', 'author_name', 'book_title'):
    df[text_column] = df[text_column].str.replace(newline_pattern, '', regex=True)

# Keep only the leading 1-4 digits of the publish date.
# - raw string avoids the invalid '\d' escape warning of a normal string literal
# - expand=False returns a Series instead of a one-column DataFrame
df['publish_date'] = (
    df['publish_date']
    .str.extract(r'^(\d{1,4})', expand=False)
    .astype('category')
)

In [None]:
# Column -> sequence of dtypes applied one after another (left to right).
# Two-step entries first normalise the values (str/int) and then turn the
# column into a pandas categorical.
dtype_steps = {
    'author_average_rating': (float,),
    'author_gender': (str, 'category'),
    'author_genres': (str, 'category'),
    'author_id': (int, 'category'),
    'author_name': (str,),
    'author_rating_count': (int,),
    'author_review_count': (int,),
    'book_id': (str, 'category'),
    'birthplace': (str,),
    'book_average_rating': (float,),
    'book_title': (str,),
    'genre_1': (str, 'category'),
    'genre_2': (str, 'category'),
    'num_ratings': (int,),
    'num_reviews': (int,),
    'pages': (int,),
    'score': (float,),
}

for column_name, steps in dtype_steps.items():
    converted = df[column_name]
    for target_dtype in steps:
        converted = converted.astype(target_dtype)
    df[column_name] = converted

In [None]:
# Preview the first rows of the frame after the dtype conversions.
df.head()

In [None]:
# Total number of pages per author gender. The chart is titled "total pages",
# so the 'pages' column must be summed; counting it would plot the number of
# books per gender instead.
pages_by_gender = df.groupby('author_gender', observed=True)['pages'].sum()
pages_by_gender.plot(kind='bar', figsize=(12, 5)).set_title('Total de páginas por gênero')

In [None]:
plt.figure(figsize=(12, 5))

# Sum only the 'pages' column. Summing every column of the grouped frame
# fails on the categorical columns in recent pandas versions and needlessly
# concatenates the string columns in older ones.
pages_by_gender_year = (
    df.dropna()
    .groupby(['author_gender', 'publish_date'], observed=True)['pages']
    .sum()
    .reset_index()  # expose the group keys as regular columns for seaborn
)

sns.lineplot(
    data=pages_by_gender_year,
    x='publish_date',
    y='pages',
    hue='author_gender',
).set_title('Total de páginas por gênero e ano de publicação')

# Hide the x tick labels (one tick per distinct publish_date value).
plt.xticks([])

In [None]:
# Page-count distribution per primary genre, split by author gender.
fig, ax = plt.subplots(figsize=(30, 5))
sns.boxplot(
    data=df,
    x='genre_1',
    y='pages',
    hue='author_gender',
    palette='rainbow',
    ax=ax,
)
# Rotate the genre labels so they stay readable on the wide axis.
plt.xticks(rotation=90)

In [None]:
# Page-count distribution per secondary genre, split by author gender.
fig, ax = plt.subplots(figsize=(30, 5))
sns.boxplot(
    data=df,
    x='genre_2',
    y='pages',
    hue='author_gender',
    palette='rainbow',
    ax=ax,
)
# Rotate the genre labels so they stay readable on the wide axis.
plt.xticks(rotation=90)

In [None]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each genre_1 value to an integer class label stored in 'target'.
encoder = LabelEncoder()
encoder.fit(df['genre_1'])
df['target'] = encoder.transform(df['genre_1'])


In [None]:
# Hold out 20% of the titles for evaluation; the fixed random_state keeps the
# split reproducible.
title_label_split = train_test_split(
    df['book_title'],
    df['target'],
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = title_label_split

In [None]:
def preprocess_text(text):
    """Return the list of tokens for ``text``.

    Stop words are removed with gensim's ``remove_stopwords`` and the result
    is tokenised with ``simple_preprocess``; empty-string tokens are discarded.
    """
    without_stopwords = remove_stopwords(text)
    tokens = []
    for token in simple_preprocess(without_stopwords):
        if token != '':
            tokens.append(token)
    return tokens

In [None]:
# Tokenise every title of both splits, preserving the row order of each split.
train_data = list(map(preprocess_text, X_train))
test_data = list(map(preprocess_text, X_test))

In [None]:
# Sanity check: first tokenised title of each split next to its label.
for token_lists, labels in ((train_data, y_train), (test_data, y_test)):
    print(token_lists[0], labels.iloc[0])

In [None]:
def _tag_corpus(token_lists):
    """Wrap each token list in a TaggedDocument tagged with its position (as a string)."""
    return [
        TaggedDocument(words=tokens, tags=[str(position)])
        for position, tokens in enumerate(token_lists)
    ]


train_tagged = _tag_corpus(train_data)
test_tagged = _tag_corpus(test_data)

In [None]:
# Sanity check: first tagged document of each split next to its label.
for tagged_docs, labels in ((train_tagged, y_train), (test_tagged, y_test)):
    print(tagged_docs[0], labels.iloc[0])

In [None]:
# Train a Doc2Vec model on the training titles only.
# seed + workers=1 make the embedding reproducible across runs, in line with
# the random_state=42 used by the other stochastic steps of this notebook.
# NOTE(review): gensim additionally requires a fixed PYTHONHASHSEED for
# bit-identical results across interpreter sessions - confirm if needed.
model = Doc2Vec(vector_size=50, min_count=2, epochs=40, seed=42, workers=1)
model.build_vocab(train_tagged)
model.train(train_tagged, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
def _infer_vectors(tagged_docs):
    """Infer one Doc2Vec vector per tagged document, in order, using the trained model."""
    return [model.infer_vector(tagged.words) for tagged in tagged_docs]


train_vectors = _infer_vectors(train_tagged)
test_vectors = _infer_vectors(test_tagged)

In [None]:
# Sanity check: first inferred vector of each split next to its label.
for vectors, labels in ((train_vectors, y_train), (test_vectors, y_test)):
    print(vectors[0], labels.iloc[0])

<h3>Tuning</h3>

In [None]:
# Hyper-parameter search spaces, keyed by classifier class name. Each value is
# passed as `param_distributions` to RandomizedSearchCV; parameter names carry
# the 'classifier__' prefix because the estimator is the 'classifier' step of
# a Pipeline.
_logreg_C = np.logspace(-4, 4, 20)

param_space = {
    # LogisticRegression only accepts certain solver/penalty pairs, so the
    # space is a list of per-solver dicts (RandomizedSearchCV samples one dict,
    # then samples parameters from it). A single flat dict would draw invalid
    # pairs such as lbfgs + l1, whose fits fail and waste search iterations.
    # The no-penalty option is spelled None; the string "none" is no longer
    # accepted by recent scikit-learn releases.
    "LogisticRegression": [
        {
            "classifier": [LogisticRegression()],
            "classifier__solver": ["lbfgs"],
            "classifier__penalty": ["l2", None],
            "classifier__C": _logreg_C,
        },
        {
            "classifier": [LogisticRegression()],
            "classifier__solver": ["liblinear"],
            "classifier__penalty": ["l1", "l2"],
            "classifier__C": _logreg_C,
        },
        {
            "classifier": [LogisticRegression()],
            "classifier__solver": ["saga"],
            "classifier__penalty": ["l1", "l2", None],
            "classifier__C": _logreg_C,
        },
        {
            # elasticnet is only supported by saga and requires l1_ratio.
            "classifier": [LogisticRegression()],
            "classifier__solver": ["saga"],
            "classifier__penalty": ["elasticnet"],
            "classifier__l1_ratio": np.linspace(0.0, 1.0, 5),
            "classifier__C": _logreg_C,
        },
    ],
    "KNeighborsClassifier": {
        "classifier": [KNeighborsClassifier()],
        "classifier__n_neighbors": range(1, 31),
        "classifier__weights": ["uniform", "distance"],
        "classifier__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        "classifier__leaf_size": range(1, 51),
    },
    "DecisionTreeClassifier": {
        "classifier": [DecisionTreeClassifier()],
        "classifier__max_depth": range(1, 31),
        "classifier__criterion": ["gini", "entropy"],
        "classifier__splitter": ["best", "random"],
    },
    "RandomForestClassifier": {
        "classifier": [RandomForestClassifier()],
        "classifier__n_estimators": range(10, 101, 10),
        "classifier__criterion": ["gini", "entropy"],
        "classifier__max_depth": range(1, 31),
        "classifier__min_samples_split": range(2, 11),
        "classifier__min_samples_leaf": range(1, 6),
        "classifier__bootstrap": [True, False],
    },
    "GradientBoostingClassifier": {
        "classifier": [GradientBoostingClassifier()],
        "classifier__n_estimators": range(10, 101, 10),
        "classifier__learning_rate": np.logspace(-4, 0, 10),
        "classifier__max_depth": range(1, 31),
        "classifier__min_samples_split": range(2, 11),
        "classifier__min_samples_leaf": range(1, 6),
        "classifier__subsample": np.arange(0.5, 1.0, 0.1),
    },
    "SVC": {
        "classifier": [SVC()],
        "classifier__C": np.logspace(-4, 4, 20),
        "classifier__kernel": ["linear", "rbf", "poly", "sigmoid"],
        "classifier__degree": range(1, 6),
        "classifier__gamma": ["scale", "auto"],
    },
}


<h3>Models</h3>

In [None]:
# Candidate estimators, each created with its default settings. The class name
# of every entry is used as the lookup key into the search-space dictionary.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(),
]

In [None]:
# Tune and evaluate every candidate classifier.
# The search, prediction and bookkeeping must all live inside the loop body
# (and therefore in one cell): when they sit in separate cells after the loop,
# only the last classifier left bound to `clf`/`pipe` is ever tuned and
# `results` ends up with a single entry.
results = []
for clf in classifiers:
    model_name = clf.__class__.__name__
    pipe = Pipeline([('classifier', clf)])

    # Randomised search over this model's space, scored with 3-fold CV.
    random_search = RandomizedSearchCV(
        pipe,
        param_space[model_name],
        n_iter=25,
        cv=3,
        n_jobs=-1,
        random_state=42,
    )
    random_search.fit(train_vectors, y_train)

    # Score the refitted best estimator on the held-out split.
    y_pred = random_search.predict(test_vectors)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({
        "model": model_name,
        "best_params": random_search.best_params_,
        "accuracy": accuracy
    })

In [None]:
# Report, per tuned model, its name, best hyper-parameters and test accuracy,
# followed by a blank separator line.
report_fields = (
    ("Model:", "model"),
    ("Best params:", "best_params"),
    ("Accuracy:", "accuracy"),
)
for entry in results:
    for label, key in report_fields:
        print(label, entry[key])
    print()