In [1]:
!pip install eli5
import csv
import json

import numpy as np
import pandas as pd
# from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# from transformers import Trainer, TrainingArguments
# from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification

# import torch

from tqdm import tqdm, trange
# from dataset_loader import load

from eli5 import show_weights
import matplotlib.pyplot as plt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# dataset_name = 'NarrativeQA'
# dataset = load(dataset_name, 'datasets/')
GOOGLE_PATH = "drive/MyDrive/si630-project/"
# all_df = pd.read_csv(GOOGLE_PATH + "RTQA-sample.csv")

In [4]:
# train_df  = pd.DataFrame.from_dict(dataset['train'])
# train_df.to_csv("squad_train.csv")
# dev_df = pd.DataFrame.from_dict(dataset['test'])
# dev_df.to_csv("squad_test.csv")
np.random.seed(42)
train_df = pd.read_csv(GOOGLE_PATH + "train_wiki.csv").sample(12000)
dev_df = pd.read_csv(GOOGLE_PATH + "test_wiki.csv").sample(3000)
train_df.shape, dev_df.shape

((12000, 2), (3000, 2))

In [5]:
train_df.head()

Unnamed: 0,text,label
202827,"Cobra Matata (also known as Banaloki Matata, J...",0
151507,Jonas Aukštuolis ( – 28 October 1949) was a Li...,0
163248,"In Etruscan mythology, Tuchulcha was a chthoni...",1
157083,Ontario Motor Speedway was a motorsport venue ...,1
69269,Great! Movies Action (stylized as GREAT! movie...,0


In [6]:
!pip install stanza
import stanza
from collections import defaultdict

def deprel_func(sentences, nlp):
    doc = nlp(sentences)
    deprel_dist = defaultdict(int)
    for sent in doc.sentences:
        for word in sent.words:
            deprel_dist[word.deprel] += 1
    return deprel_dist

nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [7]:
deprel_dist = [deprel_func(sent, nlp) for sent in tqdm(train_df['text'])]
keys = set().union(*(d.keys() for d in deprel_dist))

100%|██████████| 12000/12000 [45:02<00:00,  4.44it/s]


In [8]:
deprel_dist_dev = [deprel_func(sent, nlp) for sent in tqdm(dev_df['text'])]
dev_keys = set().union(*(d.keys() for d in deprel_dist))

100%|██████████| 3000/3000 [11:36<00:00,  4.31it/s]


In [9]:
all_keys = keys.union(dev_keys)

train_list = [{k: d.get(k, 0) for k in all_keys} for d in deprel_dist]
dev_list = [{k: d.get(k, 0) for k in all_keys} for d in deprel_dist_dev]

In [10]:
train_syn = pd.DataFrame(train_list)
train_mean = train_syn.mean(axis=0)
train_std = train_syn.std(axis=0)
dev_syn = pd.DataFrame(dev_list)

train_syn = (train_syn - train_mean) / train_std
dev_syn = (dev_syn - train_mean) / train_std

### Dummy Classifier

In [11]:
mf_dummy_clf = DummyClassifier(strategy="most_frequent")
mf_dummy_clf.fit(train_df.text, train_df.label)
mf_dummy_test_preds = mf_dummy_clf.predict(dev_df.text)
precision_recall_fscore_support(dev_df.label, mf_dummy_test_preds, average='binary')

  _warn_prf(average, modifier, msg_start, len(result))


(0.0, 0.0, 0.0, None)

In [12]:
random_dummy_clf = DummyClassifier(strategy="stratified")
random_dummy_clf.fit(train_df.text, train_df.label)
random_dummy_test_preds = random_dummy_clf.predict(dev_df.text)
precision_recall_fscore_support(dev_df.label, random_dummy_test_preds, average='binary')

(0.49222447599729546, 0.49122807017543857, 0.491725768321513, None)

In [13]:
fpr, tpr, threshold = roc_curve(np.array(dev_df.label) , random_dummy_test_preds)
auc(fpr, tpr)

0.4982490812010263

### Sklearn

In [14]:
# Convert text to features using a simple tf-idf
# min_df = is the minimum occurrences for a word to a feature
vectorizer = TfidfVectorizer(min_df=10)
X_train = vectorizer.fit_transform(train_df.text.values.astype('U'))

# Train a Logisitic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, train_df.label)

# Test the classifier by converting test data to features
X_test = vectorizer.transform(dev_df.text.values.astype('U'))
y_pred = clf.predict(X_test)
y_true = dev_df.label

# We'll using binary F1 to test whether we can detect sarcastic comments
precision_recall_fscore_support(y_true, y_pred, average='binary')


(0.8985115020297699, 0.8960863697705803, 0.8972972972972973, None)

In [15]:
fpr, tpr, threshold = roc_curve(np.array(y_true) , y_pred)
auc(fpr, tpr)

0.8986360702607843

In [16]:
pred_df = pd.DataFrame({'text': dev_df.text, 'label': dev_df.label, 'predict': y_pred})
pd.set_option('display.max_colwidth', None)
not_match_df = pred_df[pred_df.label != pred_df.predict]
not_match_df

Unnamed: 0,text,label,predict
14781,"Turów Power Station is a coal-fired power station in Bogatynia, Poland. The power station, operated by state-owned Polska Grupa Energetyczna via Oddział Elektrownia Turów, is fuelled by lignite extracted from the nearby Turów coal mine. Operations at the plant began in 1962. As of 2021 it supplied 5% of Poland's electricity and is the sole provider of heat and hot water to hospitals, schools and homes in Bogatynia. The plant initially consisted of ten 200 MW units, commissioned from 1962 to 1971. PGE undertook a US$1.6 billion modernization of units 1-6 of the plant in the early 1990s. Units 7-10 have been phased out. Unit 7 was retired in 2003. In 2010, Unit 8 was retired. Units 9 and 10 were decommissioned in 2012-2013. PGE has repowered Units 5 and 6 to co-incinerate biomass, and plans for co-firing of biomass in boilers 1, 2, 3, and 4. Units 1, 2, and 3 have been upgraded from 200 MW to 235 MW each. The plant's remaining six units have a combined capacity of 1,305 MW.",0,1
10664,"Jean-Guy is a given name. Notable people with the name include: In politics\nJean-Guy Allard (born 1948), Canadian journalist for Le Journal de Montréal and Le Journal de Québec\nJean-Guy Cardinal (1925–1979), nationalist politician in Quebec, Canada\nJean-Guy Carignan BA, MBA (born 1941), member of the Canadian House of Commons from 2000 to 2004\nJean-Guy Chrétien (born 1946), member of the Canadian House of Commons from 1993 to 2000\nJean-Guy Dagenais (born 1950), Canadian politician from Quebec\nJean-Guy Deschamps, former politician in Montreal, Quebec, Canada\nJean-Guy Dubé, Conservative Party of Canada candidate in the 2008 Canadian federal election\nJean-Guy Dubois (born 1948), Liberal party member of the Canadian House of Commons\nJean-Guy Guilbault (born 1931), member of the House of Commons of Canada\nJean-Guy Hudon (born 1941), Progressive Conservative member of the Canadian House of Commons\nJean-Guy Laforest (born 1944), business owner and former political figure in New Brunswick, Canada\nJean-Guy Péloquin, Abolitionist Party of Canada candidate in 1993 Canadian federal election\nJean-Guy Sabourin, Parti créditiste candidate in 1973 Quebec provincial election\nJean-Guy Trépanier, politician in the Quebec, Canada",0,1
56676,"In the year 1977, significant events in radio took place. The year saw the launch of the first FM radio station, WNUR in Newark, New Jersey. The year also saw the advent of satellite radio and the development of digital audio broadcasting.",1,0
25585,"William Brade (1560 – 26 February 1630) was an English poet.\n\nBrade was born in London, the son of a tailor. He was educated at Merchant Taylors' School and then at Brasenose College, Oxford, where he obtained his BA in 1581 and MA in 1585. He also studied law at the Inns of Court, but never practised.\n\nHe married Anne More, the daughter of Sir Thomas More, in 1590. They had four sons and four daughters.\n\nBrade's first published work was a translation of Lucan's ""Pharsalia"" (1592). He is chiefly remembered today for his poems, most notably ""The Shepheardes Calender"" (1579), ""The Garden"" (1585), and ""The Castle of Indolence"" (1616).",1,0
20307,"Harry Bache Smith (December 28, 1860 – January 1, 1936) was a writer, lyricist and composer. The most prolific of all American stage writers, he is said to have written over 300 librettos and more than 6000 lyrics. Some of his best-known works were librettos for the composers Victor Herbert and Reginald De Koven. He also wrote the book or lyrics for several versions of the Ziegfeld Follies. Smith was born in Buffalo, New York to Josiah Bailey Smith (born 1837) and Elizabeth Bach (born 1838). According to his autobiography First Nights and First Editions (Boston: Little, Brown, 1931), Smith's actual name at birth was Henry Bach Smith. He married twice. His first wife was Lena Reed (born August 21, 1868), whom he married on October 12, 1887 in Chicago, Illinois. They had a son named Sydney Reed Smith (born July 15, 1892). Smith's second wife was the actress Irene Bentley (1870 – June 3, 1940). They married on November 23, 1906 in Boston, Massachusetts, after she had been divorced on June 12, 1906 by her first husband James Thomas Sothoron, Jr. (1867–1913). Bentley retired from the stage in 1910 and died at Allenhurst, New Jersey. She is buried in Woodlawn Cemetery in the Bronx, NY. While on a brief holiday in Atlantic City, New Jersey, on New Year's Day in 1936, Smith died of a heart attack in his room at the Marlborough-Blenheim Hotel.",0,1
...,...,...,...
50197,"The Rose Bowl Game is an annual American college football bowl game, usually played on January 1 (New Year's Day) at the Rose Bowl in Pasadena, California. When New Year's Day falls on a Sunday, the game is played on Monday, January 2. The Rose Bowl Game is nicknamed ""The Granddaddy of Them All"" because it is the oldest currently operating bowl game. It was first played in 1902 as the Tournament East–West football game, and has been played annually since 1916. Since 1945, it has been the highest attended college football bowl game. Since 2021, the game has been sponsored by Capital One, and will be officially known as Rose Bowl Game Presented by Capital One Venture X in 2022. Previous sponsors include Northwestern Mutual (2015–2020), Vizio (2011–2014), Citi (2004–2010), Sony/PlayStation 2 (2003), and AT&T (1999–2002). The game is a part of the Pasadena Tournament of Roses Association's ""America's New Year Celebration"", which also includes the historic Rose Parade. Winners of the game received the Leishman Trophy, named for former Tournament of Roses presidents, William L. Leishman and Lathrop K. Leishman who played an important part in the history of this game. The Rose Bowl Game has traditionally hosted the conference champions from the Big Ten and Pac-12 conferences (or their predecessors). Since 2002, the Rose Bowl Game has occasionally deviated from its traditional matchups for use in ""national championship"" systems. In 2002 and 2006 (the 2001 and 2005 seasons), under the Bowl Championship Series (BCS) system, the Rose Bowl was designated as its championship game, and hosted the top two teams determined by the BCS system. Beginning in 2015, the Rose Bowl has been part of the College Football Playoff (CFP) as one of the New Year's Six bowls—the top six major bowl games in the national championship system—hosting one of the semifinal games every three years. During non-CFP years, the Rose Bowl reverts to its traditional Pac-12/Big Ten matchup, unless the champions from those conferences are selected to play in the College Football Playoff.",0,1
56079,"The 2021 Revolution was the second annual international event in Call of Duty: Modern Warfare Remastered, and the seventh overall event in the series. It took place on February 7, 2021, and featured a new map, Revolution, as well as new weapons and equipment.\n\nThe Revolution event introduced a new map, Revolution, which takes place in a fictional country in the Middle East. The map features a variety of tight spaces and elevated areas, which makes for a tense experience. The event also featured a number of new weapons and equipment, including the M-79 EBR and ACR rifles, the MP7 submachine gun, and the XM1014 assault rifle.",1,0
13808,"19 Puppis is a binary star system in the equatorial constellation of Puppis. The primary star is a white dwarf, the shrunken core of a late-type star. The secondary star is a hot blue-white main sequence star.\n\nThe system is approximately 470 light years from Earth, and was first observed by French astronomer J. J. Laskar in 1886. 19 Puppis A was identified as a possible planet by American astronomer W. M. Fick in 1908, and by the radial velocity method in 1915; however, subsequent observations failed to confirm the existence of a planet. In 1969, Japanese astronomer Yūsuke Koyama independently determined that 19 Puppis A had an orbiting companion, 19 Puppis B, using the radial velocity method and photometry.\n\nThe system is notable for its variable light output, which ranges from a magnitude of 6.5 to 11.2 over a period of about 2 hours 47 minutes. This variability is caused by the companion star's gravity tugging on the white dwarf's atmosphere, which causes it to brighten and then dim over time.",1,0
34540,"Hamill is a surname. Notable people with the surname include: Aaron Hamill (born 1977), Australian rules footballer\nAlex Hamill (footballer, born 1961), Scottish footballer\nAlex Hamill (footballer, born 1912), Scottish footballer\nBilly Hamill (born 1970), American motorcycle speedway rider\nBrendan Hamill (disambiguation), several people\nChristine Hamill (1923–1956), English mathematician\nChristopher Hamill (born 1958), better known as Limahl, lead singer of the 1980s English pop group Kajagoogoo\nClaire Hamill (born 1954), English singer-songwriter\nDavid Hamill (born 1957), Queensland Australian Labor Party politician\nDesmond Hamill (1936–2013), British television reporter\nDorothy Hamill (born 1956), American figure skater\nHarry Hamill (1879–1947), Australian rugby footballer\nJames A. Hamill (1877–1941), U.S. Representative from New Jersey\nJim Hamill, singer with The Kingsmen and the Oak Ridge Boys\nJamie Hamill (born 1986), Scottish footballer\nJoe Hamill (born 1984), Scottish footballer\nJohn Hamill (born 1947), English actor\nJohn Hamill (baseball) (1860–1911), American baseball player\nKate Hamill, American actress and playwright\nMatt Hamill (born 1976), American wrestler\nMark Hamill (born 1951), American actor\nMickey Hamill (1889–1943), Irish footballer\nPat Hamill (born 1950), Scottish footballer\nPatrick Hamill (1817–1895), U.S. Representative from Maryland\nPete Hamill (1935–2020), American journalist and writer\nPeter J. Hamill (c. 1885–1930), American politician\nRed Hamill (Robert George Hamill; 1917–1985), Canadian ice hockey player\nRob Hamill (born 1964), New Zealand rower and political candidate\nTommy Hamill (died 1996), Northern Irish footballer\nZach Hamill (born 1988), Canadian ice hockey player",0,1


In [17]:
vocab = vectorizer.vocabulary_
id_to_vocab = {id_: word for word, id_ in vocab.items()}

show_weights(clf,  vec=vectorizer, feature_names=id_to_vocab, top=(10, 10))

Weight?,Feature
+8.083,is
+5.540,has
+5.485,including
+4.789,died
+4.690,served
+4.004,was
+3.946,also
+3.899,began
+3.381,became
+3.298,been


### Sklearn with Syn

In [18]:
vectorizer = TfidfVectorizer(min_df=10)
X_train = vectorizer.fit_transform(train_df.text.values.astype('U'))
X_train = np.concatenate((X_train.toarray(), train_syn.to_numpy()), axis=1)

# Train a Logisitic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, train_df.label)

# Test the classifier by converting test data to features
X_test = vectorizer.transform(dev_df.text.values.astype('U'))
X_test = np.concatenate((X_test.toarray(), dev_syn.to_numpy()), axis=1)
y_pred = clf.predict(X_test)
y_true = dev_df.label

# We'll using binary F1 to test whether we can detect sarcastic comments
precision_recall_fscore_support(y_true, y_pred, average='binary')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.9084022038567493, 0.8900134952766532, 0.8991138377641444, None)

In [19]:
fpr, tpr, threshold = roc_curve(np.array(y_true) , y_pred)
auc(fpr, tpr)

0.901199106004598

In [20]:
pred_df = pd.DataFrame({'text': dev_df.text, 'label': dev_df.label, 'predict': y_pred})
pd.set_option('display.max_colwidth', None)
not_match_df = pred_df[pred_df.label != pred_df.predict]
not_match_df

Unnamed: 0,text,label,predict
14781,"Turów Power Station is a coal-fired power station in Bogatynia, Poland. The power station, operated by state-owned Polska Grupa Energetyczna via Oddział Elektrownia Turów, is fuelled by lignite extracted from the nearby Turów coal mine. Operations at the plant began in 1962. As of 2021 it supplied 5% of Poland's electricity and is the sole provider of heat and hot water to hospitals, schools and homes in Bogatynia. The plant initially consisted of ten 200 MW units, commissioned from 1962 to 1971. PGE undertook a US$1.6 billion modernization of units 1-6 of the plant in the early 1990s. Units 7-10 have been phased out. Unit 7 was retired in 2003. In 2010, Unit 8 was retired. Units 9 and 10 were decommissioned in 2012-2013. PGE has repowered Units 5 and 6 to co-incinerate biomass, and plans for co-firing of biomass in boilers 1, 2, 3, and 4. Units 1, 2, and 3 have been upgraded from 200 MW to 235 MW each. The plant's remaining six units have a combined capacity of 1,305 MW.",0,1
55000,"The Rondo dwarf galago (Paragalago rondoensis) or Rondo bushbaby is a species of primate in the family Galagidae. The dwarf galagos are the smallest members of the genus Galagoides. It weighs less than 100 grams, making it the smallest known galago. It is endemic to Tanzania where its natural habitat is subtropical or tropical dry forests. It lives in an area reported in 2012 to be less than 100 square kilometers and is threatened by habitat loss due to logging. While it was discovered in the 1950s, the Rondo dwarf galago was deemed data deficient until 1996. In 1996, the Rondo bushbaby was fully described as a species. It is now listed as one of ""The World's 25 Most Endangered Primates."" In 2010, it was also added to the Zoological Society of London's list of genetically distinct and endangered mammals. Description\nBushbabies are small primates with long tails and large ears and eyes. They all have grooming claws, a tooth comb, and a pseudo-tongue. The Rondo bushbaby can be distinguished from other dwarf galagos by its bottle brush tail. The tail is red in young Rondo bushbabies, and darkens with age. It also has a distinctive ""double unit rolling call"". Two soft units comprise the call. The first is a higher pitch sound that can be repeated up to six times at a constant tempo. This forms a phrase.",0,1
28827,"Richard O'Neil Burrell Jr. (born September 24, 1971), better known by his stage name Robb Banks, is an American rapper and actor. He is the founder and head of the record label Robb Bank Records. Banks has released five studio albums: 1998's Robb Bank, 2002's The Great Depression, 2006's The Big Dirty, 2009's American Royalty, and 2013's The Money Store. Banks has also released three mixtapes: 2005's Burn One Down, 2007's God Bless America, and 2009's Gotta Be Real. Banks starred in the 2009 film Get Low alongside Robert Duvall and Sissy Spacek. \n\nBanks was born in Houston, Texas. He moved to Los Angeles at age 16 to pursue a career in rap music. Banks began his career as a member of the hip hop group World Class Wreckin' Cru, alongside artists such as MC Ren and Mac Mall. The group disbanded in 1994. In 1996, Banks released his debut solo album Robb Bank; it peaked at number 55 on the Billboard 200 chart. The following year, he released his second album The Great Depression; it peaked at number 21 on the Billboard 200 chart. Banks released his third album, 2006's The Big Dirty, which peaked at number 12 on the Billboard 200 chart. His fourth album, 2009's American Royalty, peaked at number six on the Billboard 200 chart. His fifth album, 2013's The Money Store, peaked at number one on the Billboard 200 chart. Banks has also released three mixtapes:",1,0
41952,"Coláiste Phádraig (St. Patrick's College) is a Gaelic-medium secondary school in Lucan, County Dublin, Ireland. It was founded in 1879.\n\nThe school is a member of the Irish National Education Council and the Irish Secondary Schools Athletic Association. The school has an annual enrolment of about 980 pupils, who come from all over Dublin and surrounding counties. The school also has a sister school in Douglas, County Cork.\n\nThe college has a strong tradition in Gaelic football and camogie, with many players going on to represent Dublin at senior level. The college has also produced many successful rugby players, including Dan Leavy, Robbie Henshaw and CJ Stander.\n\nThe college has a strong music department with students taking part in music festivals around the country and internationally. Recent graduates have gone on to study music at Trinity College Dublin, the Royal Conservatory of Music and the National University of Ireland, Galway.",1,0
37669,"The Twin Towers II (also known as the Freedom Tower and the New World Trade Center) were a pair of skyscrapers in Lower Manhattan, New York City. They were constructed as part of the New York City skyline, and were the tallest buildings in the world from their completion in 1973 to 1974.\n\nThe towers were designed by architect Minoru Yamasaki and developed by the Port Authority of New York and New Jersey, which leased them to the Metropolitan Life Insurance Company (MetLife) for $3 million per year. The Port Authority also invested more than $2 billion in the project.\n\nOn September 11, 2001, they became the focus of the attacks by Al-Qaeda terrorists who used planes carrying hijacked planes to crash into each tower. The towers collapsed within two hours of each other, resulting in the deaths of more than 2,700 people. The Twin Towers II were replaced by One World Trade Center in 2013.",1,0
...,...,...,...
43402,"Shama District is one of the fourteen districts of the Gambia. The capital is Shama. The district covers an area of 2,655 km², and has a population of 237,811 as at the 2010 census.\n\nThe district is located in the southwestern part of the country and borders Senegal to the south and east, Kolda District to the northeast, Brikama District to the northwest and Serekunda District to the north. The western border of the district is formed by the Gambia River. The main river in the district is the Jamna. Other important rivers are the Banjul and Sine Rivers.\n\nThe climate in Shama District is generally hot and dry with occasional torrential rains during the wet season from July to October. The average annual temperature is around 24°C. The district has two seasons – a dry season from April to July and a wet season from October to March.\n\nThe population of Shama District is predominantly Islamic with Sufi influences prevalent throughout the district. There are also a number of animist groups living in the district. The predominant language spoken in Shama District is Pulaar.\n\nThe economy of Shama District is based on agriculture, fishing and tourism. There are a number of small-scale industries located in the district including food processing, textiles and carpentry. Cotton textile production is a major sector in the district with exports destined for Europe, North America and East Africa. There are also a number of mineral resources such",1,0
45796,"Aileen Clarke Hernandez (May 23, 1926 – February 13, 2017) was an African-American union organizer, civil rights activist, and women's rights activist who served as the president of the National Organization for Women (NOW) between 1970 and 1971. She was also the first woman to serve on the Equal Employment Opportunity Commission. Born in 1926, Hernandez attended Howard University, where her interest in civil rights was cemented in an incident where she was told that she had to hail a ""black"" taxi. After graduating with honors, she became a labor union organizer before helping found NOW. As its second president, she helped organize the Women's Strike for Equality and testified in front of a congressional subcommittee on the Equal Rights Amendment, but she left the organization out of frustration with what she saw as its racial inequities. Hernandez would go on to co-found several organizations that focused on African-American women, along with teaching at several universities in California. She died in 2017 at the age of 90.",0,1
29392,"German submarine U-778 was a Type VIIC U-boat built for Nazi Germany's Kriegsmarine during World War II.\n\nU-778 was laid down on 2 September 1942 at the Flensburger Schiffbau-Gesellschaft yard in Flensburg, Germany, as yard number 927. She was launched on 21 November 1942 and commissioned under the command of ""Oberleutnant zur See"" Dietrich von Choltitz on 5 February 1943.\n\nAfter training with the 8th U-boat Flotilla, U-778 sailed from Kiel on her first patrol on 1 May 1943. Her objective was to sink a convoy of ships travelling from North America to the United Kingdom. After a fruitless three-day search, she surfaced and torpedoed the motor vessel SS Empire Harvest off the coast of County Cork, Ireland. The U-boat then sank the ship with all hands.\n\nU-778 next attacked a convoy of five ships travelling from Gibraltar bound for Liverpool on 4 June 1943. She sank the British freighter MV Empire Pride with all hands off the coast of Portugal.\n\nOn 22 July 1943, U-778 sank the auxiliary cruiser HMAS Perth off the coast of North Africa with all hands after a 16-hour battle.\n\nU-778 ended her first patrol in early August 1943 by sailing into Lorient, France for repairs. She returned to active service on 10 October 1943 and made two more patrols before being damaged in an attack by",1,0
54491,"Robert Parker (June 26, 1796 – November 24, 1865) was a lawyer, judge and political figure in New Brunswick. He represented St. John County in the Legislative Assembly of New Brunswick from 1826 to 1830. Parker was born in Saint John, New Brunswick, the son of Robert Parker and Jane Hatch, and was educated in Saint John and Windsor, Nova Scotia. He went on to study at King's College, then studied law with Ward Chipman, Jr. and was called to the bar in 1820. In the same year, he married Susan Robinson, the niece of John Robinson and became a director and the solicitor for the Bank of New Brunswick. From 1826 to 1834, he practiced law in partnership with his brother Neville. Parker served as attorney general in 1828 following the death of Thomas Wetmore and became solicitor general later that year after Charles Jeffery Peters was named attorney general. He was also named judge commissary in the vice admiralty court. In 1834, he was named puisne judge in the province's Supreme Court. In 1865, he was named Chief Justice but died a few months later in Saint John.",0,1


In [21]:
vocab = vectorizer.vocabulary_
id_to_vocab = {id_: word for word, id_ in vocab.items()}
for i, k in enumerate(all_keys, len(id_to_vocab)):
    id_to_vocab[i] = k

show_weights(clf,  vec=vectorizer, feature_names=id_to_vocab, top=(10, 10))

Weight?,Feature
+5.335,including
+4.625,also
+3.986,has
+3.023,served
+2.898,2006
+2.867,important
+2.652,most
+2.553,born
+2.485,began
+2.404,album


### Only Syn

In [22]:
vectorizer = TfidfVectorizer(min_df=10)
X_train = train_syn.to_numpy()

# Train a Logisitic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, train_df.label)

# Test the classifier by converting test data to features
X_test = dev_syn.to_numpy()
y_pred = clf.predict(X_test)
y_true = dev_df.label

# We'll using binary F1 to test whether we can detect sarcastic comments
precision_recall_fscore_support(y_true, y_pred, average='binary')

(0.8342465753424657, 0.8218623481781376, 0.8280081577158396, None)

In [23]:
fpr, tpr, threshold = roc_curve(np.array(y_true) , y_pred)
auc(fpr, tpr)

0.8312210291615325

In [24]:
pred_df = pd.DataFrame({'text': dev_df.text, 'label': dev_df.label, 'predict': y_pred})
pd.set_option('display.max_colwidth', None)
not_match_df = pred_df[pred_df.label != pred_df.predict]
not_match_df

Unnamed: 0,text,label,predict
14781,"Turów Power Station is a coal-fired power station in Bogatynia, Poland. The power station, operated by state-owned Polska Grupa Energetyczna via Oddział Elektrownia Turów, is fuelled by lignite extracted from the nearby Turów coal mine. Operations at the plant began in 1962. As of 2021 it supplied 5% of Poland's electricity and is the sole provider of heat and hot water to hospitals, schools and homes in Bogatynia. The plant initially consisted of ten 200 MW units, commissioned from 1962 to 1971. PGE undertook a US$1.6 billion modernization of units 1-6 of the plant in the early 1990s. Units 7-10 have been phased out. Unit 7 was retired in 2003. In 2010, Unit 8 was retired. Units 9 and 10 were decommissioned in 2012-2013. PGE has repowered Units 5 and 6 to co-incinerate biomass, and plans for co-firing of biomass in boilers 1, 2, 3, and 4. Units 1, 2, and 3 have been upgraded from 200 MW to 235 MW each. The plant's remaining six units have a combined capacity of 1,305 MW.",0,1
55000,"The Rondo dwarf galago (Paragalago rondoensis) or Rondo bushbaby is a species of primate in the family Galagidae. The dwarf galagos are the smallest members of the genus Galagoides. It weighs less than 100 grams, making it the smallest known galago. It is endemic to Tanzania where its natural habitat is subtropical or tropical dry forests. It lives in an area reported in 2012 to be less than 100 square kilometers and is threatened by habitat loss due to logging. While it was discovered in the 1950s, the Rondo dwarf galago was deemed data deficient until 1996. In 1996, the Rondo bushbaby was fully described as a species. It is now listed as one of ""The World's 25 Most Endangered Primates."" In 2010, it was also added to the Zoological Society of London's list of genetically distinct and endangered mammals. Description\nBushbabies are small primates with long tails and large ears and eyes. They all have grooming claws, a tooth comb, and a pseudo-tongue. The Rondo bushbaby can be distinguished from other dwarf galagos by its bottle brush tail. The tail is red in young Rondo bushbabies, and darkens with age. It also has a distinctive ""double unit rolling call"". Two soft units comprise the call. The first is a higher pitch sound that can be repeated up to six times at a constant tempo. This forms a phrase.",0,1
11941,"Kitty Linn O'Neil (March 24, 1946 – November 2, 2018) was an American stuntwoman and racer, known as ""the fastest woman in the world"". An illness in early childhood left her deaf, and more illnesses in early adulthood cut short a career in diving. O'Neil's career as a stuntwoman and race driver led to her depiction in a television movie and as an action figure. Her women's absolute land speed record stood until 2019. Early life\nKitty Linn O'Neil was born in Corpus Christi, Texas on March 24, 1946. John O'Neil, her father, was an officer in the United States Army Air Forces, who had been an oil wildcatter. He died in an airplane crash during Kitty's childhood. Her mother, Patsy Compton O'Neil, was native Cherokee. At five months of age, O'Neil contracted simultaneous childhood diseases, losing her hearing. After her deafness became apparent at the age of two, her mother taught her lip-reading and speech, eventually becoming a speech therapist and co-founding a school for students with hearing impairment in Wichita Falls, Texas.",0,1
28827,"Richard O'Neil Burrell Jr. (born September 24, 1971), better known by his stage name Robb Banks, is an American rapper and actor. He is the founder and head of the record label Robb Bank Records. Banks has released five studio albums: 1998's Robb Bank, 2002's The Great Depression, 2006's The Big Dirty, 2009's American Royalty, and 2013's The Money Store. Banks has also released three mixtapes: 2005's Burn One Down, 2007's God Bless America, and 2009's Gotta Be Real. Banks starred in the 2009 film Get Low alongside Robert Duvall and Sissy Spacek. \n\nBanks was born in Houston, Texas. He moved to Los Angeles at age 16 to pursue a career in rap music. Banks began his career as a member of the hip hop group World Class Wreckin' Cru, alongside artists such as MC Ren and Mac Mall. The group disbanded in 1994. In 1996, Banks released his debut solo album Robb Bank; it peaked at number 55 on the Billboard 200 chart. The following year, he released his second album The Great Depression; it peaked at number 21 on the Billboard 200 chart. Banks released his third album, 2006's The Big Dirty, which peaked at number 12 on the Billboard 200 chart. His fourth album, 2009's American Royalty, peaked at number six on the Billboard 200 chart. His fifth album, 2013's The Money Store, peaked at number one on the Billboard 200 chart. Banks has also released three mixtapes:",1,0
41952,"Coláiste Phádraig (St. Patrick's College) is a Gaelic-medium secondary school in Lucan, County Dublin, Ireland. It was founded in 1879.\n\nThe school is a member of the Irish National Education Council and the Irish Secondary Schools Athletic Association. The school has an annual enrolment of about 980 pupils, who come from all over Dublin and surrounding counties. The school also has a sister school in Douglas, County Cork.\n\nThe college has a strong tradition in Gaelic football and camogie, with many players going on to represent Dublin at senior level. The college has also produced many successful rugby players, including Dan Leavy, Robbie Henshaw and CJ Stander.\n\nThe college has a strong music department with students taking part in music festivals around the country and internationally. Recent graduates have gone on to study music at Trinity College Dublin, the Royal Conservatory of Music and the National University of Ireland, Galway.",1,0
...,...,...,...
7027,"Ilya Ilich Chernyaev (21 January 1893 – 10 March 1972) was a Russian communist politician and a leading figure in Soviet foreign policy during the Cold War. He served as the Soviet ambassador to the United States from 1946 to 1950, then as Foreign Minister from 1951 to 1957.\n\nBorn into a working-class family in the town of Kiev in what was then the Russian Empire, Chernyaev became a Marxist revolutionary while studying at Moscow State University. He was arrested and spent five years in prison before escaping to France in 1924. He returned to Russia after the Russian Revolution and became a leading member of the Communist Party of the Soviet Union (CPSU). In 1939 he was appointed Deputy Commissar for Foreign Affairs, serving in that position until 1946.\n\nChernyaev played an important role in Soviet policy during the Cold War, helping to maintain good relations with the United States despite ideological differences. He also played a leading role in negotiations over the Berlin Blockade and the Suez Crisis. After his retirement from politics, Chernyaev served as President of the International Academy of Sciences (1962–1965). He was awarded the Lenin Prize in 1965. Ilya Ilich Chernyaev died in 1972.",1,0
54491,"Robert Parker (June 26, 1796 – November 24, 1865) was a lawyer, judge and political figure in New Brunswick. He represented St. John County in the Legislative Assembly of New Brunswick from 1826 to 1830. Parker was born in Saint John, New Brunswick, the son of Robert Parker and Jane Hatch, and was educated in Saint John and Windsor, Nova Scotia. He went on to study at King's College, then studied law with Ward Chipman, Jr. and was called to the bar in 1820. In the same year, he married Susan Robinson, the niece of John Robinson and became a director and the solicitor for the Bank of New Brunswick. From 1826 to 1834, he practiced law in partnership with his brother Neville. Parker served as attorney general in 1828 following the death of Thomas Wetmore and became solicitor general later that year after Charles Jeffery Peters was named attorney general. He was also named judge commissary in the vice admiralty court. In 1834, he was named puisne judge in the province's Supreme Court. In 1865, he was named Chief Justice but died a few months later in Saint John.",0,1
50921,"The Two Ewalds (or Two Hewalds) were Saint Ewald the Black and Saint Ewald the Fair, martyrs in Old Saxony about 692. Both bore the same name, but were distinguished by the difference in the colour of their hair and complexions. They began their mission labours about 690 at the ancient Saxons country, now part of Westphalia, and covered by the dioceses of Münster, Osnabrück, and Paderborn. They are honored as saints in Westphalia. Background\nThe two priests were companions, both natives of Northumbria, England. According to the example of many at that time, they spent several years as students in the schools of Ireland. Ewald the Black was the more learned of the two, but both were equally renowned for holiness of life. They were apparently acquainted with St. Willibrord, the Apostle of Friesland, and were animated with his zeal for the conversion of the Germans. Some sources number them among the eleven companions of that saint. More probably, however, they set out from England after St. Willibrord's departure, in an attempt to convert their own cousins in Old Saxony.",0,1
53239,"The North London Hospice (NLH) is a registered charity offering hospice care to patients with life-limiting and terminal illnesses. It was founded in 1984 in response to the lack of aftercare for patients being discharged from hospital in north London following the closure of St. Columbus Hospital in 1981, which had been north London's only long-stay hospital. The north London Hospice was the United Kingdom's first multi-faith hospice. It provides its specialist palliative and end-of-life care to people within the boroughs of Barnet, Enfield and Haringey. This takes place at its Finchley in-patient unit, which was opened in 1992, it's Health & Wellbeing Centre in Winchmore Hill and the majority of care is provided to people at home. North London Hospice welcomes people from all faiths and communities, and those of no faith. The hospice provides physical, emotional and spiritual care to more than 3,500 patients a year (2020/21) and supports their families, friends and carers.",0,1


In [25]:
id_to_vocab = {}
for i, k in enumerate(all_keys):
    id_to_vocab[i] = k

show_weights(clf,  vec=vectorizer, feature_names=id_to_vocab, top=(10, 10))

Weight?,Feature
+1.079,root
+0.592,punct
+0.481,obl
+0.312,aux
+0.281,nmod
+0.224,cc
+0.211,mark
+0.196,nmod:poss
+0.188,obl:agent
+0.183,aux:pass
