In [2]:
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
model = DistilBertForSequenceClassification.from_pretrained("./sentiment_model_custom")
tokenizer = DistilBertTokenizer.from_pretrained("./sentiment_model_custom")
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

classifier = pipeline(
    "text-classification", 
    model=model, 
    tokenizer=tokenizer, 
    device="cuda" if torch.cuda.is_available() else "cpu",
    truncation=True,
    max_length=512
)


# Classifying large text using a sliding window
def classify_long_text(text, window_size=128, stride=64):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    logits = []
    
    # Split tokens into chunks of length 128, classify chunk, slide window by 64 tokens
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i + window_size]
        chunk = tokenizer.decode(chunk, skip_special_tokens=True)
        outputs = classifier(chunk, return_all_scores=True)
        logits.append(torch.tensor([score['score'] for score in outputs[0]]))
    
    avg_logits = torch.mean(torch.stack(logits), dim=0)
    return torch.argmax(avg_logits).item(), avg_logits

long_text = "WHEN a traffic signal at a busy intersection flashes green and red at the same time, it can be a white-knuckle moment -- unless you stop, look carefully, and make your own considered judgment whether it's safe to proceed. \
        Signals from the economy have been a bit like that lately. The bond market has been predicting a recession while the stock market has been signaling strong growth ahead. And economists are divided. Some are forecasting a nasty downturn \
        in 2007 while others expect the expansion to remain firmly on track. Given these mixed signals, it's wise to spend a little extra effort to figure out what's actually happening. You may find the exercise encouraging.\
        First, a bit of history as a guide. This month marks the current economic expansion's fifth birthday, an advanced age that might suggest that a slowdown is coming soon. In the past 25 years or so, though, major structural changes in the economy\
         -- including a shift in emphasis from manufacturing to services, and growing use of information technology to manage inventories and operations -- have lengthened the business cycle.\
        The more proactive approach of the Federal Reserve has also helped. The last two economic cycles lasted 10 years each, much longer than many previous ones. If recent history is a guide, we are in the middle of the current business cycle, with years\
        of prosperity ahead. That would be consistent with the conflicting signs in the data and the markets. About midway through a typical business cycle, there is often a shift in the sources of growth -- away from interest-rate-sensitive sectors like \
        housing and autos, and toward others like business investment. But any transition is tricky, and the changeover is not always smooth. A bumpy economic transition seems under way now. Growth has slowed to an annual rate of 1.6 percent in the third \
        quarter from 5.6 percent in the first quarter, as activity in housing and autos has cooled, while business investment hasn't surged. Still, there are good reasons to be optimistic about where this transition will take us. For starters, a lot is \
        going right in the economy. The corporate sector is enjoying record levels of cash and strong profits, and the stock market has been climbing. The global economy is humming along, and the outlook for Europe and Asia is upbeat. Crude oil has come \
        down sharply, to less than $60 a barrel from more than $70. And the latest inflation figures are benign. The Producer Price Index fell 1.6 percent in October, with core prices excluding food and energy dropping 0.9 percent; the Consumer Price Index \
        also declined, by 0.5 percent. That isn't all. The capacity utilization rate -- an indicator of production levels in factories and mines -- has begun to drift downward from peak levels, in line with moderate growth and easing inflationary pressures. \
        At the same time, employment is robust, with the jobless rate at 4.4 percent last month. And the Federal Reserve has stopped raising rates, at least for now. Still, the economy faces risks. In the bond market, the yield curve is inverted, a classic sign \
        of trouble. An inverted yield curve occurs when short-term market interest rates are higher than long-term rates, often indicating that investors expect a recession. One possible reason for pessimism is the downturn in the housing market, which could hurt \
        consumer spending. That, in turn, could cause corporate America to cut back its spending and expansion plans, sending the economy into recession -- or so it is feared. (Consumer spending, though, has held up well so far, probably because of falling gas \
        prices and the strong labor market.) The housing slump's effect on construction has been significant. According to government figures, construction of new single-family homes and apartments dropped to an annual rate of 1.486 million units last month; \
        that is down 14.6 percent from September, and is the lowest level in more than six years. Home builders like Beazer Homes and Toll Brothers have said they still don't see any bottom in the market. But even if the housing slowdown hurts consumer \
        spending in the coming months, economists like Mark Zandi of Moody's Economy.com doubt that it will derail the expansion. 'The only way the housing correction can undermine the economy is if it first infects financial markets, \
        'Mr. Zandi said. 'For that to occur we'd need a financial event which hurts global investor confidence or the banking sector directly, and becomes a global problem.' It is possible to imagine such a situation. \
        'Many people including myself believe that the market is underpricing risk,' said Martin N. Baily, senior fellow at the Peterson Institute, a research organization based in Washington. \
        'It looks like some institutions may be taking on more risk than they realize, and if there are defaults or asset price declines then they may get into trouble and it might spread.' In particular, some economists have been concerned that as \
        the housing market softens, lenders have issued mortgages to people who can't afford to pay them back. If that results in a large number of defaults and losses, the fast-growing market in subprime mortgage-backed securities could be undermined. \
        'Credit risk is increasingly being borne by securities holders, in particular those holding collateralized debt obligations,' said Jonathan Reiss, the principal of Analytical Synthesis, a research group based in New York. 'The result is that the \
        originators of the loan are much less exposed to the consequences than they were even, say, five years ago, and there's a feeling that somehow this could all end badly.' BUT Mr. Reiss and others, while appreciating the risk of a financial market crisis, \
        are not predicting one. What's more, it's hard to figure the economic consequences of a financial crisis. The last major one, which began with the Asian crisis in 1997 and culminated in the near-collapse of Long Term Capital Management in 1998, didn't \
        derail the economic expansion of the '90s. And even the combination of a bursting stock bubble and Sept. 11, 2001, caused only a brief setback for the overall economy. So, at the moment, cautious optimism seems the most sensible approach. Mixed signals \
        from the markets and divided opinions from the experts may be unsettling, but they are consistent with the prospect of a long stretch of prosperity ahead. Perhaps we shouldn't really worry until all the signs begin to point the same way."

prediction, scores = classify_long_text(long_text)
print(label_map[prediction], scores)

Device set to use cuda
Token indices sequence length is longer than the specified maximum sequence length for this model (1334 > 512). Running this sequence through the model will result in indexing errors


neutral tensor([0.3291, 0.3548, 0.3161])


In [6]:
# Compare to classification without sliding window
print(classifier(long_text, return_all_scores=True))

[[{'label': 'negative', 'score': 0.2467794120311737}, {'label': 'neutral', 'score': 0.29594483971595764}, {'label': 'positive', 'score': 0.45727577805519104}]]




In [None]:
# Understanding tokenization a little more
tokens = tokenizer.tokenize("hypersensitivity")
print(tokens)
print(f"Tokens: {len(tokens)}")

['hyper', '##sen', '##sit', '##ivity']
Tokens: 4
