## predictor 3 Gradient Boosting

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('entrenamiento_lsp.csv')

In [3]:
data

Unnamed: 0,text,esia
0,"['verse', '1', 'feisty', 'beyoncé', 'im', 'a',...",1
1,"['alaina', 'sixtrye', 'is', 'a', 'human', 'fem...",1
2,"['the', 'ora01031', 'error', 'can', 'occur', '...",1
3,"['nours', 'attitude', 'in', 'this', 'respect',...",0
4,"['in', 'life', 'whatever', 'situation', 'it', ...",0
...,...,...
1004514,"['verse', '1', 'in', 'a', 'small', 'town', 'in...",1
1004515,"['when', 'someone', 'does', 'something', 'wron...",1
1004516,"['rnli', 'is', 'spending', '44m', 'to', 'repla...",1
1004517,"['instead', 'of', 'using', 'paddingleft', 'on'...",1


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Tomar una muestra representativa de tus datos (por ejemplo, el 10% de ellos)
sample_data = data.sample(frac=0.10, random_state=42)

# División de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(sample_data['text'], sample_data['esia'], test_size=0.2, random_state=42)

# Convertir texto a características numéricas usando TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Entrenar un modelo de regresión logística con la muestra
clf_lr = LogisticRegression()
clf_lr.fit(X_train_vec, y_train)

# Predicciones y evaluación para regresión logística
y_pred_lr = clf_lr.predict(X_test_vec)
print("Resultados Regresión Logística:")
print(classification_report(y_test, y_pred_lr))

# Entrenar un modelo SVM con la muestra
clf_svm = SVC(kernel='linear')
clf_svm.fit(X_train_vec, y_train)

# Predicciones y evaluación para SVM
y_pred_svm = clf_svm.predict(X_test_vec)
print("Resultados SVM:")
print(classification_report(y_test, y_pred_svm))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.91      0.88      0.89     58794
           1       0.95      0.96      0.96    142110

    accuracy                           0.94    200904
   macro avg       0.93      0.92      0.92    200904
weighted avg       0.94      0.94      0.94    200904



In [5]:
def probabilidad_ia(texto, vectorizer, clasificador):
    # Transformar el texto con el vectorizador
    texto_vec = vectorizer.transform([texto])
    
    # Obtener las probabilidades
    prob = clasificador.predict_proba(texto_vec)
    
    # Devolver la probabilidad de que sea IA
    return prob[0][1]

texto_nuevo = "Marguerite Priola (1849–1876) was a French operatic soprano. She made her debut in 1869 in Paris as the Messenger of Peace in the first French production of Wagner's Rienzi at the Théâtre Lyrique. She enjoyed a successful career at the Opéra-Comique until 1874, performing mainly coloratura soprano roles. There she created several roles, including Princess Elsbeth in Offenbach's Fantasio, Maritana in Massenet's Don César de Bazan, and Javotte in Le roi l'a dit by Delibes. In 1876, she joined the Opéra de Marseille, where she appeared as Philine in Mignon by Ambroise Thomas. Unable to use her voice to its full potential due to illness, she was booed throughout the performance. The illness developed into a serious outbreak of typhoid fever and she died three weeks later at the age of 27. This 1873 portrait, taken by the French photographer Alexandre Quinet, shows Priola in her role in Le Roi l'a dit."
prob = probabilidad_ia(texto_nuevo, vectorizer, clf)
print(f"Probabilidad de que el texto haya sido generado por una IA: {prob*100:.2f}%")



Probabilidad de que el texto haya sido generado por una IA: 16.70%


In [6]:
historical_text = "The French Revolution began in 1789 and lasted until the rise of Napoleon Bonaparte in 1799."


In [7]:
sci_fi_text = "By the year 3050, humans had established colonies on Mars and were beginning to terraform the planet."


In [8]:
technical_text = "Machine learning is a branch of artificial intelligence that focuses on the development of algorithms that allow machines to learn from data."


In [9]:
poetic_text = "Beneath the mantle of the night sky, the stars shine with a gleam reflecting humanity's dreams."


In [10]:
ai_text = "The internet of emotions has become the next evolutionary step in technology, where devices can detect and respond to human emotions in real-time."


In [11]:
texts = [historical_text, sci_fi_text, technical_text, poetic_text, ai_text]
names = ["Historical", "Science Fiction", "Technical", "Poetic", "Generated by AI"]

for name, text in zip(names, texts):
    prob = probabilidad_ia(text, vectorizer, clf)
    print(f"Probability that the text ({name}) was generated by AI: {prob*100:.2f}%")



Probability that the text (Historical) was generated by AI: 71.11%
Probability that the text (Science Fiction) was generated by AI: 19.82%
Probability that the text (Technical) was generated by AI: 54.46%
Probability that the text (Poetic) was generated by AI: 79.02%
Probability that the text (Generated by AI) was generated by AI: 32.27%


In [15]:
wiki_txt1 = "The SS Princess Alice sank on 3 September 1878 after a collision with the collier vessel SS Bywell Castle on the River Thames. Between 600 and 700 people died, all from the paddle steamer, in the greatest loss of life of any British inland waterway shipping accident. Princess Alice was owned by the London Steamboat Co and captained by William R. H. Grinstead."
wiki_txt2 = "Marguerite Priola (1849–1876) was a French operatic soprano. She made her debut in 1869 in Paris as the Messenger of Peace in the first French production of Wagner's Rienzi at the Théâtre Lyrique. She enjoyed a successful career at the Opéra-Comique until 1874, performing mainly coloratura soprano roles. There she created several roles, including Princess Elsbeth in Offenbach's Fantasio, Maritana in Massenet's Don César de Bazan, and Javotte in Le roi l'a dit by Delibes"
wiki_txt3 = "Verdi was commissioned by the Teatro La Fenice in Venice to write an opera, but finding the right subject took some time, and the composer worked with the inexperienced Piave in shaping first one and then another drama by Hugo into an acceptable libretto. As musicologist Roger Parker notes, the composer intervened on several important points, insisting for example that the role of Ernani be sung by a tenor (rather than by a contralto as had originally been planned)"
wiki_txt4 = "Alzira is an opera in a prologue and two acts by Giuseppe Verdi to an Italian libretto by Salvatore Cammarano, based on the 1736 play Alzire, ou les Américains by Voltaire."

In [16]:
texts = [wiki_txt1, wiki_txt2, wiki_txt3, wiki_txt4]
names = ["wiki_txt1", "wiki_txt2", "wiki_txt3", "wiki_txt4"]

for name, text in zip(names, texts):
    prob = probabilidad_ia(text, vectorizer, clf)
    print(f"Probability that the text ({name}) was generated by AI: {prob*100:.2f}%")

Probability that the text (wiki_txt1) was generated by AI: 16.46%
Probability that the text (wiki_txt2) was generated by AI: 15.87%
Probability that the text (wiki_txt3) was generated by AI: 14.73%
Probability that the text (wiki_txt4) was generated by AI: 18.55%


In [17]:
texts = [wiki_txt1, wiki_txt2, wiki_txt3, wiki_txt4]
names = ["wiki_txt1", "wiki_txt2", "wiki_txt3", "wiki_txt4"]

for name, text in zip(names, texts):
    prob = probabilidad_ia(text, vectorizer, clf)
    print(f"Probability that the text ({name}) was generated by AI: {prob*100:.2f}%")

Probability that the text (wiki_txt1) was generated by AI: 16.46%
Probability that the text (wiki_txt2) was generated by AI: 15.87%
Probability that the text (wiki_txt3) was generated by AI: 14.73%
Probability that the text (wiki_txt4) was generated by AI: 18.55%
